Diffstat (limited to 'lib/Target/ARM')
-rw-r--r--  lib/Target/ARM/ARM.h | 23
-rw-r--r--  lib/Target/ARM/ARM.td | 33
-rw-r--r--  lib/Target/ARM/ARMAddressingModes.h | 109
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.cpp | 1060
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.h | 333
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.cpp | 1360
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.h | 148
-rw-r--r--  lib/Target/ARM/ARMCallingConv.td | 2
-rw-r--r--  lib/Target/ARM/ARMCodeEmitter.cpp | 211
-rw-r--r--  lib/Target/ARM/ARMConstantIslandPass.cpp | 775
-rw-r--r--  lib/Target/ARM/ARMConstantPoolValue.cpp | 36
-rw-r--r--  lib/Target/ARM/ARMConstantPoolValue.h | 33
-rw-r--r--  lib/Target/ARM/ARMFrameInfo.h | 4
-rw-r--r--  lib/Target/ARM/ARMISelDAGToDAG.cpp | 1196
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 1523
-rw-r--r--  lib/Target/ARM/ARMISelLowering.h | 91
-rw-r--r--  lib/Target/ARM/ARMInstrFormats.td | 906
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.cpp | 856
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.h | 243
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.td | 682
-rw-r--r--  lib/Target/ARM/ARMInstrNEON.td | 2127
-rw-r--r--  lib/Target/ARM/ARMInstrThumb.td | 662
-rw-r--r--  lib/Target/ARM/ARMInstrThumb2.td | 826
-rw-r--r--  lib/Target/ARM/ARMInstrVFP.td | 195
-rw-r--r--  lib/Target/ARM/ARMJITInfo.cpp | 68
-rw-r--r--  lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 562
-rw-r--r--  lib/Target/ARM/ARMMCAsmInfo.cpp | 72
-rw-r--r--  lib/Target/ARM/ARMMCAsmInfo.h | 31
-rw-r--r--  lib/Target/ARM/ARMMachineFunctionInfo.h | 19
-rw-r--r--  lib/Target/ARM/ARMPerfectShuffle.h | 6586
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.cpp | 1367
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.h | 121
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.td | 99
-rw-r--r--  lib/Target/ARM/ARMSchedule.td | 149
-rw-r--r--  lib/Target/ARM/ARMScheduleV6.td | 12
-rw-r--r--  lib/Target/ARM/ARMScheduleV7.td | 587
-rw-r--r--  lib/Target/ARM/ARMSubtarget.cpp | 77
-rw-r--r--  lib/Target/ARM/ARMSubtarget.h | 31
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.cpp | 218
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.h | 50
-rw-r--r--  lib/Target/ARM/ARMTargetObjectFile.h | 39
-rw-r--r--  lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 618
-rw-r--r--  lib/Target/ARM/AsmParser/CMakeLists.txt | 6
-rw-r--r--  lib/Target/ARM/AsmParser/Makefile | 15
-rw-r--r--  lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp | 691
-rw-r--r--  lib/Target/ARM/AsmPrinter/Makefile | 2
-rw-r--r--  lib/Target/ARM/CMakeLists.txt | 7
-rw-r--r--  lib/Target/ARM/Makefile | 2
-rw-r--r--  lib/Target/ARM/NEONPreAllocPass.cpp | 394
-rw-r--r--  lib/Target/ARM/README-Thumb.txt | 28
-rw-r--r--  lib/Target/ARM/README-Thumb2.txt | 6
-rw-r--r--  lib/Target/ARM/README.txt | 63
-rw-r--r--  lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp | 23
-rw-r--r--  lib/Target/ARM/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/ARM/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/ARM/Thumb1InstrInfo.cpp | 185
-rw-r--r--  lib/Target/ARM/Thumb1InstrInfo.h | 27
-rw-r--r--  lib/Target/ARM/Thumb1RegisterInfo.cpp | 323
-rw-r--r--  lib/Target/ARM/Thumb1RegisterInfo.h | 39
-rw-r--r--  lib/Target/ARM/Thumb2ITBlockPass.cpp | 158
-rw-r--r--  lib/Target/ARM/Thumb2InstrInfo.cpp | 635
-rw-r--r--  lib/Target/ARM/Thumb2InstrInfo.h | 74
-rw-r--r--  lib/Target/ARM/Thumb2RegisterInfo.cpp | 724
-rw-r--r--  lib/Target/ARM/Thumb2RegisterInfo.h | 29
-rw-r--r--  lib/Target/ARM/Thumb2SizeReduction.cpp | 685
65 files changed, 20707 insertions(+), 7571 deletions(-)
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 08dc07c..487ce1d 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -15,6 +15,7 @@
#ifndef TARGET_ARM_H
#define TARGET_ARM_H
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
@@ -24,7 +25,8 @@ class ARMBaseTargetMachine;
class FunctionPass;
class MachineCodeEmitter;
class JITCodeEmitter;
-class raw_ostream;
+class ObjectCodeEmitter;
+class formatted_raw_ostream;
// Enums corresponding to ARM condition codes
namespace ARMCC {
@@ -50,7 +52,7 @@ namespace ARMCC {
inline static CondCodes getOppositeCondition(CondCodes CC){
switch (CC) {
- default: assert(0 && "Unknown condition code");
+ default: llvm_unreachable("Unknown condition code");
case EQ: return NE;
case NE: return EQ;
case HS: return LO;
@@ -71,7 +73,7 @@ namespace ARMCC {
inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
switch (CC) {
- default: assert(0 && "Unknown condition code");
+ default: llvm_unreachable("Unknown condition code");
case ARMCC::EQ: return "eq";
case ARMCC::NE: return "ne";
case ARMCC::HS: return "hs";
@@ -90,20 +92,23 @@ inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
}
}
-FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM);
-FunctionPass *createARMCodePrinterPass(raw_ostream &O,
- ARMBaseTargetMachine &TM,
- bool Verbose);
-FunctionPass *createARMCodeEmitterPass(ARMBaseTargetMachine &TM,
- MachineCodeEmitter &MCE);
+FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
FunctionPass *createARMCodeEmitterPass(ARMBaseTargetMachine &TM,
MachineCodeEmitter &MCE);
FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
JITCodeEmitter &JCE);
+FunctionPass *createARMObjectCodeEmitterPass(ARMBaseTargetMachine &TM,
+ ObjectCodeEmitter &OCE);
FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
FunctionPass *createARMConstantIslandPass();
+FunctionPass *createNEONPreAllocPass();
+FunctionPass *createThumb2ITBlockPass();
+FunctionPass *createThumb2SizeReductionPass();
+
+extern Target TheARMTarget, TheThumbTarget;
} // end namespace llvm;
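
A note on the assert(0 && ...) to llvm_unreachable(...) substitutions in this hunk: assert compiles away under NDEBUG, so in release builds the switch would silently fall off the end of the function, whereas llvm_unreachable still traps in debug builds and becomes an optimizer hint otherwise. A minimal standalone sketch of the idea (hypothetical macro name; not LLVM's actual implementation, which lives in llvm/Support/ErrorHandling.h):

#include <cassert>
#include <cstdlib>

// Hypothetical stand-in for llvm_unreachable: aborts loudly in debug builds,
// becomes an optimizer hint (no fall-through path) when NDEBUG is defined.
#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
#define MY_UNREACHABLE(msg) __builtin_unreachable()
#else
#define MY_UNREACHABLE(msg) (assert(0 && msg), std::abort())
#endif

enum CondCodes { EQ, NE };

static const char *condName(CondCodes CC) {
  switch (CC) {
  case EQ: return "eq";
  case NE: return "ne";
  }
  MY_UNREACHABLE("Unknown condition code"); // no silent return path
}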
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 9001e50..8851fbb 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -89,27 +89,20 @@ def : ProcNoItin<"xscale", [ArchV5TE]>;
def : ProcNoItin<"iwmmxt", [ArchV5TE]>;
// V6 Processors.
-def : Processor<"arm1136j-s", V6Itineraries,
- [ArchV6]>;
-def : Processor<"arm1136jf-s", V6Itineraries,
- [ArchV6, FeatureVFP2]>;
-def : Processor<"arm1176jz-s", V6Itineraries,
- [ArchV6]>;
-def : Processor<"arm1176jzf-s", V6Itineraries,
- [ArchV6, FeatureVFP2]>;
-def : Processor<"mpcorenovfp", V6Itineraries,
- [ArchV6]>;
-def : Processor<"mpcore", V6Itineraries,
- [ArchV6, FeatureVFP2]>;
+def : ProcNoItin<"arm1136j-s", [ArchV6]>;
+def : ProcNoItin<"arm1136jf-s", [ArchV6, FeatureVFP2]>;
+def : ProcNoItin<"arm1176jz-s", [ArchV6]>;
+def : ProcNoItin<"arm1176jzf-s", [ArchV6, FeatureVFP2]>;
+def : ProcNoItin<"mpcorenovfp", [ArchV6]>;
+def : ProcNoItin<"mpcore", [ArchV6, FeatureVFP2]>;
// V6T2 Processors.
-def : Processor<"arm1156t2-s", V6Itineraries,
- [ArchV6T2, FeatureThumb2]>;
-def : Processor<"arm1156t2f-s", V6Itineraries,
- [ArchV6T2, FeatureThumb2, FeatureVFP2]>;
+def : ProcNoItin<"arm1156t2-s", [ArchV6T2, FeatureThumb2]>;
+def : ProcNoItin<"arm1156t2f-s", [ArchV6T2, FeatureThumb2, FeatureVFP2]>;
// V7 Processors.
-def : ProcNoItin<"cortex-a8", [ArchV7A, FeatureThumb2, FeatureNEON]>;
+def : Processor<"cortex-a8", CortexA8Itineraries,
+ [ArchV7A, FeatureThumb2, FeatureNEON]>;
def : ProcNoItin<"cortex-a9", [ArchV7A, FeatureThumb2, FeatureNEON]>;
//===----------------------------------------------------------------------===//
@@ -131,13 +124,13 @@ def ARMInstrInfo : InstrInfo {
let TSFlagsFields = ["AddrModeBits",
"SizeFlag",
"IndexModeBits",
- "isUnaryDataProc",
- "Form"];
+ "Form",
+ "isUnaryDataProc"];
let TSFlagsShifts = [0,
4,
7,
9,
- 10];
+ 15];
}
//===----------------------------------------------------------------------===//
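
For context on the TSFlagsShifts change above (the last shift moving from 10 to 15): Form now sits before isUnaryDataProc and, per the ARMII enum in ARMBaseInstrInfo.h further down, occupies six bits starting at bit 9, pushing the unary-data-processing flag to bit 15. A small sketch of the resulting layout, using the mask values from that enum:

#include <cstdint>

// TSFlags bit layout implied by TSFlagsShifts = [0, 4, 7, 9, 15]
// (values mirror ARMII in ARMBaseInstrInfo.h below):
//   [3:0]  AddrModeBits   [6:4] SizeFlag   [8:7] IndexModeBits
//   [14:9] Form (6 bits)  [15]  isUnaryDataProc
constexpr unsigned AddrModeMask   = 0xf;
constexpr unsigned SizeShift      = 4, SizeMask      = 7u << SizeShift;
constexpr unsigned IndexModeShift = 7, IndexModeMask = 3u << IndexModeShift;
constexpr unsigned FormShift      = 9, FormMask      = 0x3fu << FormShift;
constexpr unsigned UnaryDP        = 1u << 15;

constexpr unsigned getAddrMode(uint64_t TSFlags) { return TSFlags & AddrModeMask; }
constexpr unsigned getForm(uint64_t TSFlags)     { return TSFlags & FormMask; }
constexpr bool     isUnaryDP(uint64_t TSFlags)   { return (TSFlags & UnaryDP) != 0; }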
diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h
index 15c9ec1..1839153 100644
--- a/lib/Target/ARM/ARMAddressingModes.h
+++ b/lib/Target/ARM/ARMAddressingModes.h
@@ -15,11 +15,12 @@
#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
namespace llvm {
-
+
/// ARM_AM - ARM Addressing Mode Stuff
namespace ARM_AM {
enum ShiftOpc {
@@ -30,14 +31,14 @@ namespace ARM_AM {
ror,
rrx
};
-
+
enum AddrOpc {
add = '+', sub = '-'
};
-
+
static inline const char *getShiftOpcStr(ShiftOpc Op) {
switch (Op) {
- default: assert(0 && "Unknown shift opc!");
+ default: llvm_unreachable("Unknown shift opc!");
case ARM_AM::asr: return "asr";
case ARM_AM::lsl: return "lsl";
case ARM_AM::lsr: return "lsr";
@@ -45,7 +46,7 @@ namespace ARM_AM {
case ARM_AM::rrx: return "rrx";
}
}
-
+
static inline ShiftOpc getShiftOpcForNode(SDValue N) {
switch (N.getOpcode()) {
default: return ARM_AM::no_shift;
@@ -70,7 +71,7 @@ namespace ARM_AM {
static inline const char *getAMSubModeStr(AMSubMode Mode) {
switch (Mode) {
- default: assert(0 && "Unknown addressing sub-mode!");
+ default: llvm_unreachable("Unknown addressing sub-mode!");
case ARM_AM::ia: return "ia";
case ARM_AM::ib: return "ib";
case ARM_AM::da: return "da";
@@ -80,7 +81,7 @@ namespace ARM_AM {
static inline const char *getAMSubModeAltStr(AMSubMode Mode, bool isLD) {
switch (Mode) {
- default: assert(0 && "Unknown addressing sub-mode!");
+ default: llvm_unreachable("Unknown addressing sub-mode!");
case ARM_AM::ia: return isLD ? "fd" : "ea";
case ARM_AM::ib: return isLD ? "ed" : "fa";
case ARM_AM::da: return isLD ? "fa" : "ed";
@@ -94,14 +95,14 @@ namespace ARM_AM {
assert(Amt < 32 && "Invalid rotate amount");
return (Val >> Amt) | (Val << ((32-Amt)&31));
}
-
+
/// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
///
static inline unsigned rotl32(unsigned Val, unsigned Amt) {
assert(Amt < 32 && "Invalid rotate amount");
return (Val << Amt) | (Val >> ((32-Amt)&31));
}
-
+
//===--------------------------------------------------------------------===//
// Addressing Mode #1: shift_operand with registers
//===--------------------------------------------------------------------===//
@@ -136,7 +137,7 @@ namespace ARM_AM {
static inline unsigned getSOImmValRot(unsigned Imm) {
return (Imm >> 8) * 2;
}
-
+
/// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
/// computing the rotate amount to use. If this immediate value cannot be
/// handled with a single shifter-op, determine a good rotate amount that will
@@ -145,14 +146,14 @@ namespace ARM_AM {
// 8-bit (or less) immediates are trivially shifter_operands with a rotate
// of zero.
if ((Imm & ~255U) == 0) return 0;
-
+
// Use CTZ to compute the rotate amount.
unsigned TZ = CountTrailingZeros_32(Imm);
-
+
// Rotate amount must be even. Something like 0x200 must be rotated 8 bits,
// not 9.
unsigned RotAmt = TZ & ~1;
-
+
// If we can handle this spread, return it.
if ((rotr32(Imm, RotAmt) & ~255U) == 0)
return (32-RotAmt)&31; // HW rotates right, not left.
@@ -165,16 +166,16 @@ namespace ARM_AM {
// Restart the search for a high-order bit after the initial sequence of
// ones.
unsigned TZ2 = CountTrailingZeros_32(Imm & ~((1 << TrailingOnes)-1));
-
+
// Rotate amount must be even.
unsigned RotAmt2 = TZ2 & ~1;
-
+
// If this fits, use it.
if (RotAmt2 != 32 && (rotr32(Imm, RotAmt2) & ~255U) == 0)
return (32-RotAmt2)&31; // HW rotates right, not left.
}
}
-
+
// Otherwise, we have no way to cover this span of bits with a single
// shifter_op immediate. Return a chunk of bits that will be useful to
// handle.
@@ -188,17 +189,17 @@ namespace ARM_AM {
// 8-bit (or less) immediates are trivially shifter_operands with a rotate
// of zero.
if ((Arg & ~255U) == 0) return Arg;
-
+
unsigned RotAmt = getSOImmValRotate(Arg);
// If this cannot be handled with a single shifter_op, bail out.
if (rotr32(~255U, RotAmt) & Arg)
return -1;
-
+
// Encode this correctly.
return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
}
-
+
/// isSOImmTwoPartVal - Return true if the specified value can be obtained by
/// or'ing together two SOImmVal's.
static inline bool isSOImmTwoPartVal(unsigned V) {
@@ -206,12 +207,12 @@ namespace ARM_AM {
V = rotr32(~255U, getSOImmValRotate(V)) & V;
if (V == 0)
return false;
-
+
// If this can be handled with two shifter_op's, accept.
V = rotr32(~255U, getSOImmValRotate(V)) & V;
return V == 0;
}
-
+
/// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
/// return the first chunk of it.
static inline unsigned getSOImmTwoPartFirst(unsigned V) {
@@ -221,14 +222,14 @@ namespace ARM_AM {
/// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
/// return the second chunk of it.
static inline unsigned getSOImmTwoPartSecond(unsigned V) {
- // Mask out the first hunk.
+ // Mask out the first hunk.
V = rotr32(~255U, getSOImmValRotate(V)) & V;
-
+
// Take what's left.
assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
return V;
}
-
+
/// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed
/// by a left shift. Returns the shift amount to use.
static inline unsigned getThumbImmValShift(unsigned Imm) {
@@ -243,7 +244,7 @@ namespace ARM_AM {
/// isThumbImmShiftedVal - Return true if the specified value can be obtained
/// by left shifting a 8-bit immediate.
static inline bool isThumbImmShiftedVal(unsigned V) {
- // If this can be handled with
+ // If this can be handled with
V = (~255U << getThumbImmValShift(V)) & V;
return V == 0;
}
@@ -259,10 +260,10 @@ namespace ARM_AM {
return CountTrailingZeros_32(Imm);
}
- /// isThumbImm16ShiftedVal - Return true if the specified value can be
+ /// isThumbImm16ShiftedVal - Return true if the specified value can be
/// obtained by left shifting a 16-bit immediate.
static inline bool isThumbImm16ShiftedVal(unsigned V) {
- // If this can be handled with
+ // If this can be handled with
V = (~65535U << getThumbImm16ValShift(V)) & V;
return V == 0;
}
@@ -273,28 +274,6 @@ namespace ARM_AM {
return V >> getThumbImmValShift(V);
}
- /// getT2SOImmValDecode - Given a 12-bit encoded Thumb-2 modified immediate,
- /// return the corresponding 32-bit immediate value.
- /// See ARM Reference Manual A6.3.2.
- static inline unsigned getT2SOImmValDecode(unsigned Imm) {
- unsigned Base = Imm & 0xff;
- switch ((Imm >> 8) & 0xf) {
- case 0:
- return Base;
- case 1:
- return Base | (Base << 16);
- case 2:
- return (Base << 8) | (Base << 24);
- case 3:
- return Base | (Base << 8) | (Base << 16) | (Base << 24);
- default:
- break;
- }
-
- // shifted immediate
- unsigned RotAmount = ((Imm >> 7) & 0x1f) - 8;
- return (Base | 0x80) << (24 - RotAmount);
- }
/// getT2SOImmValSplat - Return the 12-bit encoded representation
/// if the specified value can be obtained by splatting the low 8 bits
@@ -305,12 +284,12 @@ namespace ARM_AM {
/// abcdefgh abcdefgh abcdefgh abcdefgh control = 3
/// Return -1 if none of the above apply.
/// See ARM Reference Manual A6.3.2.
- static inline int getT2SOImmValSplat(unsigned V) {
+ static inline int getT2SOImmValSplatVal(unsigned V) {
unsigned u, Vs, Imm;
// control = 0
- if ((V & 0xffffff00) == 0)
+ if ((V & 0xffffff00) == 0)
return V;
-
+
// If the value is zeroes in the first byte, just shift those off
Vs = ((V & 0xff) == 0) ? V >> 8 : V;
// Any passing value only has 8 bits of payload, splatted across the word
@@ -329,11 +308,11 @@ namespace ARM_AM {
return -1;
}
- /// getT2SOImmValRotate - Return the 12-bit encoded representation if the
+ /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
/// specified value is a rotated 8-bit value. Return -1 if no rotation
/// encoding is possible.
/// See ARM Reference Manual A6.3.2.
- static inline int getT2SOImmValRotate (unsigned V) {
+ static inline int getT2SOImmValRotateVal(unsigned V) {
unsigned RotAmt = CountLeadingZeros_32(V);
if (RotAmt >= 24)
return -1;
@@ -346,23 +325,23 @@ namespace ARM_AM {
}
/// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
- /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
+ /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
/// encoding for it. If not, return -1.
/// See ARM Reference Manual A6.3.2.
static inline int getT2SOImmVal(unsigned Arg) {
// If 'Arg' is an 8-bit splat, then get the encoded value.
- int Splat = getT2SOImmValSplat(Arg);
+ int Splat = getT2SOImmValSplatVal(Arg);
if (Splat != -1)
return Splat;
-
+
// If 'Arg' can be handled with a single shifter_op return the value.
- int Rot = getT2SOImmValRotate(Arg);
+ int Rot = getT2SOImmValRotateVal(Arg);
if (Rot != -1)
return Rot;
return -1;
}
-
+
//===--------------------------------------------------------------------===//
// Addressing Mode #2
@@ -380,7 +359,7 @@ namespace ARM_AM {
// If this addressing mode is a frame index (before prolog/epilog insertion
// and code rewriting), this operand will have the form: FI#, reg0, <offs>
// with no shift amount for the frame offset.
- //
+ //
static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) {
assert(Imm12 < (1 << 12) && "Imm too large!");
bool isSub = Opc == sub;
@@ -395,8 +374,8 @@ namespace ARM_AM {
static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
return (ShiftOpc)(AM2Opc >> 13);
}
-
-
+
+
//===--------------------------------------------------------------------===//
// Addressing Mode #3
//===--------------------------------------------------------------------===//
@@ -409,7 +388,7 @@ namespace ARM_AM {
// The first operand is always a Reg. The second operand is a reg if in
// reg/reg form, otherwise it's reg#0. The third field encodes the operation
// in bit 8, the immediate in bits 0-7.
-
+
/// getAM3Opc - This function encodes the addrmode3 opc field.
static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) {
bool isSub = Opc == sub;
@@ -421,7 +400,7 @@ namespace ARM_AM {
static inline AddrOpc getAM3Op(unsigned AM3Opc) {
return ((AM3Opc >> 8) & 1) ? sub : add;
}
-
+
//===--------------------------------------------------------------------===//
// Addressing Mode #4
//===--------------------------------------------------------------------===//
@@ -469,7 +448,7 @@ namespace ARM_AM {
//
// IA - Increment after
// DB - Decrement before
-
+
/// getAM5Opc - This function encodes the addrmode5 opc field.
static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
bool isSub = Opc == sub;
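
The getSOImmVal* helpers in this file all revolve around the ARM "modified immediate": a 32-bit value is encodable as an AddrMode1 shifter-operand immediate iff it is an 8-bit value rotated right by an even amount, packed as (rot/2) << 8 | base8. A self-contained sketch of that encoding (brute-force search rather than the CTZ-based getSOImmValRotate above):

#include <cassert>
#include <cstdio>

static unsigned rotl32(unsigned Val, unsigned Amt) {
  return (Val << Amt) | (Val >> ((32 - Amt) & 31));
}

// Return the 12-bit so_imm encoding of Imm, or -1 if none exists.
static int encodeSOImm(unsigned Imm) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2)  // HW rotate amount must be even
    if ((rotl32(Imm, Rot) & ~255U) == 0)      // un-rotating leaves an 8-bit payload
      return ((Rot / 2) << 8) | rotl32(Imm, Rot);
  return -1;
}

int main() {
  assert(encodeSOImm(0xFF) == 0xFF);      // fits directly, rotate 0
  assert(encodeSOImm(0x3FC00) == 0xBFF);  // 0xFF rotated right by 22: rot field 11
  assert(encodeSOImm(0x101) == -1);       // spans 9 bits: a two-part value
  std::printf("ok\n");
  return 0;
}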
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
new file mode 100644
index 0000000..ecdf5a0
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -0,0 +1,1060 @@
+//===- ARMBaseInstrInfo.cpp - ARM Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseInstrInfo.h"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMGenInstrInfo.inc"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+static cl::opt<bool>
+EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
+ cl::desc("Enable ARM 2-addr to 3-addr conv"));
+
+ARMBaseInstrInfo::ARMBaseInstrInfo()
+ : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)) {
+}
+
+MachineInstr *
+ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ // FIXME: Thumb2 support.
+
+ if (!EnableARM3Addr)
+ return NULL;
+
+ MachineInstr *MI = MBBI;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ unsigned TSFlags = MI->getDesc().TSFlags;
+ bool isPre = false;
+ switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
+ default: return NULL;
+ case ARMII::IndexModePre:
+ isPre = true;
+ break;
+ case ARMII::IndexModePost:
+ break;
+ }
+
+ // Try splitting an indexed load/store to an un-indexed one plus an add/sub
+ // operation.
+ unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
+ if (MemOpc == 0)
+ return NULL;
+
+ MachineInstr *UpdateMI = NULL;
+ MachineInstr *MemMI = NULL;
+ unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned NumOps = TID.getNumOperands();
+ bool isLoad = !TID.mayStore();
+ const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
+ const MachineOperand &Base = MI->getOperand(2);
+ const MachineOperand &Offset = MI->getOperand(NumOps-3);
+ unsigned WBReg = WB.getReg();
+ unsigned BaseReg = Base.getReg();
+ unsigned OffReg = Offset.getReg();
+ unsigned OffImm = MI->getOperand(NumOps-2).getImm();
+ ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
+ switch (AddrMode) {
+ default:
+ assert(false && "Unknown indexed op!");
+ return NULL;
+ case ARMII::AddrMode2: {
+ bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+ unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+ if (OffReg == 0) {
+ if (ARM_AM::getSOImmVal(Amt) == -1)
+ // Can't encode it in a so_imm operand. This transformation will
+ // add more than 1 instruction. Abandon!
+ return NULL;
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+ .addReg(BaseReg).addImm(Amt)
+ .addImm(Pred).addReg(0).addReg(0);
+ } else if (Amt != 0) {
+ ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+ unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
+ .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
+ .addImm(Pred).addReg(0).addReg(0);
+ } else
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+ .addReg(BaseReg).addReg(OffReg)
+ .addImm(Pred).addReg(0).addReg(0);
+ break;
+ }
+ case ARMII::AddrMode3 : {
+ bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
+ unsigned Amt = ARM_AM::getAM3Offset(OffImm);
+ if (OffReg == 0)
+ // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+ .addReg(BaseReg).addImm(Amt)
+ .addImm(Pred).addReg(0).addReg(0);
+ else
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+ .addReg(BaseReg).addReg(OffReg)
+ .addImm(Pred).addReg(0).addReg(0);
+ break;
+ }
+ }
+
+ std::vector<MachineInstr*> NewMIs;
+ if (isPre) {
+ if (isLoad)
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc), MI->getOperand(0).getReg())
+ .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+ else
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc)).addReg(MI->getOperand(1).getReg())
+ .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+ NewMIs.push_back(MemMI);
+ NewMIs.push_back(UpdateMI);
+ } else {
+ if (isLoad)
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc), MI->getOperand(0).getReg())
+ .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+ else
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc)).addReg(MI->getOperand(1).getReg())
+ .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+ if (WB.isDead())
+ UpdateMI->getOperand(0).setIsDead();
+ NewMIs.push_back(UpdateMI);
+ NewMIs.push_back(MemMI);
+ }
+
+ // Transfer LiveVariables states, kill / dead info.
+ if (LV) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+
+ LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+ if (MO.isDef()) {
+ MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
+ if (MO.isDead())
+ LV->addVirtualRegisterDead(Reg, NewMI);
+ }
+ if (MO.isUse() && MO.isKill()) {
+ for (unsigned j = 0; j < 2; ++j) {
+ // Look at the two new MI's in reverse order.
+ MachineInstr *NewMI = NewMIs[j];
+ if (!NewMI->readsRegister(Reg))
+ continue;
+ LV->addVirtualRegisterKilled(Reg, NewMI);
+ if (VI.removeKill(MI))
+ VI.Kills.push_back(NewMI);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ MFI->insert(MBBI, NewMIs[1]);
+ MFI->insert(MBBI, NewMIs[0]);
+ return NewMIs[0];
+}
+
+// Branch analysis.
+bool
+ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ unsigned LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (isUncondBranchOpcode(LastOpc)) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+ if (isCondBranchOpcode(LastOpc)) {
+ // Block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(0).getMBB();
+ Cond.push_back(LastInst->getOperand(1));
+ Cond.push_back(LastInst->getOperand(2));
+ return false;
+ }
+ return true; // Can't handle indirect branch.
+ }
+
+ // Get the instruction before it if it is a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ return true;
+
+ // If the block ends with a B and a Bcc, handle it.
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
+ if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ Cond.push_back(SecondLastInst->getOperand(1));
+ Cond.push_back(SecondLastInst->getOperand(2));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The second
+ // one is not executed, so remove it.
+ if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // ...likewise if it ends with a branch table followed by an unconditional
+ // branch. The branch folder can create these, and we must get rid of them for
+ // correctness of Thumb constant islands.
+ if (isJumpTableBranchOpcode(SecondLastOpc) &&
+ isUncondBranchOpcode(LastOpc)) {
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+
+unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin()) return 0;
+ --I;
+ if (!isUncondBranchOpcode(I->getOpcode()) &&
+ !isCondBranchOpcode(I->getOpcode()))
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin()) return 1;
+ --I;
+ if (!isCondBranchOpcode(I->getOpcode()))
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+unsigned
+ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME this should probably have a DebugLoc argument
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+
+ ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
+ int BOpc = !AFI->isThumbFunction()
+ ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
+ int BccOpc = !AFI->isThumbFunction()
+ ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc);
+
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 2 || Cond.size() == 0) &&
+ "ARM branch conditions have two components!");
+
+ if (FBB == 0) {
+ if (Cond.empty()) // Unconditional branch?
+ BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB);
+ else
+ BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+ return 1;
+ }
+
+ // Two-way conditional branch.
+ BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+ BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB);
+ return 2;
+}
+
+bool ARMBaseInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
+ Cond[0].setImm(ARMCC::getOppositeCondition(CC));
+ return false;
+}
+
+bool ARMBaseInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+ const SmallVectorImpl<MachineOperand> &Pred) const {
+ unsigned Opc = MI->getOpcode();
+ if (isUncondBranchOpcode(Opc)) {
+ MI->setDesc(get(getMatchingCondBranchOpcode(Opc)));
+ MI->addOperand(MachineOperand::CreateImm(Pred[0].getImm()));
+ MI->addOperand(MachineOperand::CreateReg(Pred[1].getReg(), false));
+ return true;
+ }
+
+ int PIdx = MI->findFirstPredOperandIdx();
+ if (PIdx != -1) {
+ MachineOperand &PMO = MI->getOperand(PIdx);
+ PMO.setImm(Pred[0].getImm());
+ MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
+ return true;
+ }
+ return false;
+}
+
+bool ARMBaseInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+ const SmallVectorImpl<MachineOperand> &Pred2) const {
+ if (Pred1.size() > 2 || Pred2.size() > 2)
+ return false;
+
+ ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm();
+ ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm();
+ if (CC1 == CC2)
+ return true;
+
+ switch (CC1) {
+ default:
+ return false;
+ case ARMCC::AL:
+ return true;
+ case ARMCC::HS:
+ return CC2 == ARMCC::HI;
+ case ARMCC::LS:
+ return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
+ case ARMCC::GE:
+ return CC2 == ARMCC::GT;
+ case ARMCC::LE:
+ return CC2 == ARMCC::LT;
+ }
+}
+
+bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
+ std::vector<MachineOperand> &Pred) const {
+ // FIXME: This confuses implicit_def with optional CPSR def.
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (!TID.getImplicitDefs() && !TID.hasOptionalDef())
+ return false;
+
+ bool Found = false;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == ARM::CPSR) {
+ Pred.push_back(MO);
+ Found = true;
+ }
+ }
+
+ return Found;
+}
+
+
+/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+ unsigned JTI) DISABLE_INLINE;
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+ unsigned JTI) {
+ return JT[JTI].MBBs.size();
+}
+
+/// GetInstSize - Return the size of the specified MachineInstr.
+///
+unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ const MachineBasicBlock &MBB = *MI->getParent();
+ const MachineFunction *MF = MBB.getParent();
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+ // Basic size info comes from the TSFlags field.
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned TSFlags = TID.TSFlags;
+
+ unsigned Opc = MI->getOpcode();
+ switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) {
+ default: {
+ // If this machine instr is an inline asm, measure it.
+ if (MI->getOpcode() == ARM::INLINEASM)
+ return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
+ if (MI->isLabel())
+ return 0;
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unknown or unset size field for instr!");
+ case TargetInstrInfo::IMPLICIT_DEF:
+ case TargetInstrInfo::KILL:
+ case TargetInstrInfo::DBG_LABEL:
+ case TargetInstrInfo::EH_LABEL:
+ return 0;
+ }
+ break;
+ }
+ case ARMII::Size8Bytes: return 8; // ARM instruction x 2.
+ case ARMII::Size4Bytes: return 4; // ARM / Thumb2 instruction.
+ case ARMII::Size2Bytes: return 2; // Thumb1 instruction.
+ case ARMII::SizeSpecial: {
+ switch (Opc) {
+ case ARM::CONSTPOOL_ENTRY:
+ // If this machine instr is a constant pool entry, its size is recorded as
+ // operand #2.
+ return MI->getOperand(2).getImm();
+ case ARM::Int_eh_sjlj_setjmp:
+ return 24;
+ case ARM::t2Int_eh_sjlj_setjmp:
+ return 20;
+ case ARM::BR_JTr:
+ case ARM::BR_JTm:
+ case ARM::BR_JTadd:
+ case ARM::tBR_JTr:
+ case ARM::t2BR_JT:
+ case ARM::t2TBB:
+ case ARM::t2TBH: {
+ // These are jumptable branches, i.e. a branch followed by an inlined
+ // jumptable. The size is 4 + 4 * number of entries. For TBB, each
+ // entry is one byte; for TBH, two bytes each.
+ unsigned EntrySize = (Opc == ARM::t2TBB)
+ ? 1 : ((Opc == ARM::t2TBH) ? 2 : 4);
+ unsigned NumOps = TID.getNumOperands();
+ MachineOperand JTOP =
+ MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2));
+ unsigned JTI = JTOP.getIndex();
+ const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ assert(JTI < JT.size());
+ // Thumb instructions are 2 byte aligned, but JT entries are 4 byte
+ // aligned. The assembler / linker may add 2 byte padding just before
+ // the JT entries. The size does not include this padding; the
+ // constant islands pass does separate bookkeeping for it.
+ // FIXME: If we know the size of the function is less than (1 << 16) *2
+ // bytes, we can use 16-bit entries instead. Then there won't be an
+ // alignment issue.
+ unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4;
+ unsigned NumEntries = getNumJTEntries(JT, JTI);
+ if (Opc == ARM::t2TBB && (NumEntries & 1))
+ // Make sure the instruction that follows TBB is 2-byte aligned.
+ // FIXME: Constant island pass should insert an "ALIGN" instruction
+ // instead.
+ ++NumEntries;
+ return NumEntries * EntrySize + InstSize;
+ }
+ default:
+ // Otherwise, pseudo-instruction sizes are zero.
+ return 0;
+ }
+ }
+ }
+ return 0; // Not reached
+}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+///
+bool
+ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned& SrcSubIdx, unsigned& DstSubIdx) const {
+ SrcSubIdx = DstSubIdx = 0; // No sub-registers.
+
+ switch (MI.getOpcode()) {
+ default: break;
+ case ARM::FCPYS:
+ case ARM::FCPYD:
+ case ARM::VMOVD:
+ case ARM::VMOVQ: {
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ case ARM::MOVr:
+ case ARM::tMOVr:
+ case ARM::tMOVgpr2tgpr:
+ case ARM::tMOVtgpr2gpr:
+ case ARM::tMOVgpr2gpr:
+ case ARM::t2MOVr: {
+ assert(MI.getDesc().getNumOperands() >= 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "Invalid ARM MOV instruction");
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ }
+
+ return false;
+}
+
+unsigned
+ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case ARM::LDR:
+ case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame.
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isReg() &&
+ MI->getOperand(3).isImm() &&
+ MI->getOperand(2).getReg() == 0 &&
+ MI->getOperand(3).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::t2LDRi12:
+ case ARM::tRestore:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::FLDD:
+ case ARM::FLDS:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+
+ return 0;
+}
+
+unsigned
+ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case ARM::STR:
+ case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isReg() &&
+ MI->getOperand(3).isImm() &&
+ MI->getOperand(2).getReg() == 0 &&
+ MI->getOperand(3).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::t2STRi12:
+ case ARM::tSpill:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::FSTD:
+ case ARM::FSTS:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+
+ return 0;
+}
+
+bool
+ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (DestRC != SrcRC) {
+ // Allow DPR / DPR_VFP2 / DPR_8 cross-class copies
+ // Allow QPR / QPR_VFP2 cross-class copies
+ if (DestRC == ARM::DPRRegisterClass) {
+ if (SrcRC == ARM::DPR_VFP2RegisterClass ||
+ SrcRC == ARM::DPR_8RegisterClass) {
+ } else
+ return false;
+ } else if (DestRC == ARM::DPR_VFP2RegisterClass) {
+ if (SrcRC == ARM::DPRRegisterClass ||
+ SrcRC == ARM::DPR_8RegisterClass) {
+ } else
+ return false;
+ } else if (DestRC == ARM::DPR_8RegisterClass) {
+ if (SrcRC == ARM::DPRRegisterClass ||
+ SrcRC == ARM::DPR_VFP2RegisterClass) {
+ } else
+ return false;
+ } else if ((DestRC == ARM::QPRRegisterClass &&
+ SrcRC == ARM::QPR_VFP2RegisterClass) ||
+ (DestRC == ARM::QPR_VFP2RegisterClass &&
+ SrcRC == ARM::QPRRegisterClass)) {
+ } else
+ return false;
+ }
+
+ if (DestRC == ARM::GPRRegisterClass) {
+ AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr),
+ DestReg).addReg(SrcReg)));
+ } else if (DestRC == ARM::SPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYS), DestReg)
+ .addReg(SrcReg));
+ } else if ((DestRC == ARM::DPRRegisterClass) ||
+ (DestRC == ARM::DPR_VFP2RegisterClass) ||
+ (DestRC == ARM::DPR_8RegisterClass)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
+ .addReg(SrcReg));
+ } else if (DestRC == ARM::QPRRegisterClass ||
+ DestRC == ARM::QPR_VFP2RegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg);
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+void ARMBaseInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+ MachineMemOperand::MOStore, 0,
+ MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
+
+ if (RC == ARM::GPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO));
+ } else if (RC == ARM::DPRRegisterClass ||
+ RC == ARM::DPR_VFP2RegisterClass ||
+ RC == ARM::DPR_8RegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else if (RC == ARM::SPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTS))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else {
+ assert((RC == ARM::QPRRegisterClass ||
+ RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!");
+ // FIXME: Neon instructions should support predicates
+ BuildMI(MBB, I, DL, get(ARM::VSTRQ)).addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ }
+}
+
+void ARMBaseInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, int FI,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+ MachineMemOperand::MOLoad, 0,
+ MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
+
+ if (RC == ARM::GPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg)
+ .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO));
+ } else if (RC == ARM::DPRRegisterClass ||
+ RC == ARM::DPR_VFP2RegisterClass ||
+ RC == ARM::DPR_8RegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else if (RC == ARM::SPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDS), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else {
+ assert((RC == ARM::QPRRegisterClass ||
+ RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!");
+ // FIXME: Neon instructions should support predicates
+ BuildMI(MBB, I, DL, get(ARM::VLDRQ), DestReg).addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ }
+}
+
+MachineInstr *ARMBaseInstrInfo::
+foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops, int FI) const {
+ if (Ops.size() != 1) return NULL;
+
+ unsigned OpNum = Ops[0];
+ unsigned Opc = MI->getOpcode();
+ MachineInstr *NewMI = NULL;
+ if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) {
+ // If it is updating CPSR, then it cannot be folded.
+ if (MI->getOperand(4).getReg() == ARM::CPSR && !MI->getOperand(4).isDead())
+ return NULL;
+ unsigned Pred = MI->getOperand(2).getImm();
+ unsigned PredReg = MI->getOperand(3).getReg();
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ bool isUndef = MI->getOperand(1).isUndef();
+ if (Opc == ARM::MOVr)
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR))
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+ else // ARM::t2MOVr
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12))
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isUndef = MI->getOperand(0).isUndef();
+ if (Opc == ARM::MOVr)
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(isDead) |
+ getUndefRegState(isUndef))
+ .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+ else // ARM::t2MOVr
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(isDead) |
+ getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ }
+ } else if (Opc == ARM::tMOVgpr2gpr ||
+ Opc == ARM::tMOVtgpr2gpr ||
+ Opc == ARM::tMOVgpr2tgpr) {
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ bool isUndef = MI->getOperand(1).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12))
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isUndef = MI->getOperand(0).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(isDead) |
+ getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0);
+ }
+ } else if (Opc == ARM::FCPYS) {
+ unsigned Pred = MI->getOperand(2).getImm();
+ unsigned PredReg = MI->getOperand(3).getReg();
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ bool isUndef = MI->getOperand(1).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTS))
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addFrameIndex(FI)
+ .addImm(0).addImm(Pred).addReg(PredReg);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isUndef = MI->getOperand(0).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDS))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(isDead) |
+ getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ }
+ }
+ else if (Opc == ARM::FCPYD) {
+ unsigned Pred = MI->getOperand(2).getImm();
+ unsigned PredReg = MI->getOperand(3).getReg();
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ bool isUndef = MI->getOperand(1).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTD))
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isUndef = MI->getOperand(0).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDD))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(isDead) |
+ getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ }
+ }
+
+ return NewMI;
+}
+
+MachineInstr*
+ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const {
+ // FIXME
+ return 0;
+}
+
+bool
+ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const {
+ if (Ops.size() != 1) return false;
+
+ unsigned Opc = MI->getOpcode();
+ if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) {
+ // If it is updating CPSR, then it cannot be folded.
+ return MI->getOperand(4).getReg() != ARM::CPSR ||
+ MI->getOperand(4).isDead();
+ } else if (Opc == ARM::tMOVgpr2gpr ||
+ Opc == ARM::tMOVtgpr2gpr ||
+ Opc == ARM::tMOVgpr2tgpr) {
+ return true;
+ } else if (Opc == ARM::FCPYS || Opc == ARM::FCPYD) {
+ return true;
+ } else if (Opc == ARM::VMOVD || Opc == ARM::VMOVQ) {
+ return false; // FIXME
+ }
+
+ return false;
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes
+llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
+ int PIdx = MI->findFirstPredOperandIdx();
+ if (PIdx == -1) {
+ PredReg = 0;
+ return ARMCC::AL;
+ }
+
+ PredReg = MI->getOperand(PIdx+1).getReg();
+ return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
+}
+
+
+int llvm::getMatchingCondBranchOpcode(int Opc) {
+ if (Opc == ARM::B)
+ return ARM::Bcc;
+ else if (Opc == ARM::tB)
+ return ARM::tBcc;
+ else if (Opc == ARM::t2B)
+ return ARM::t2Bcc;
+
+ llvm_unreachable("Unknown unconditional branch opcode!");
+ return 0;
+}
+
+
+void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ unsigned DestReg, unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII) {
+ bool isSub = NumBytes < 0;
+ if (isSub) NumBytes = -NumBytes;
+
+ while (NumBytes) {
+ unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
+ unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
+ assert(ThisVal && "Didn't extract field correctly");
+
+ // We will handle these bits from offset, clear them.
+ NumBytes &= ~ThisVal;
+
+ assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?");
+
+ // Build the new ADD / SUB.
+ unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri;
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ BaseReg = DestReg;
+ }
+}
+
+bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII) {
+ unsigned Opcode = MI.getOpcode();
+ const TargetInstrDesc &Desc = MI.getDesc();
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ bool isSub = false;
+
+ // Memory operands in inline assembly always use AddrMode2.
+ if (Opcode == ARM::INLINEASM)
+ AddrMode = ARMII::AddrMode2;
+
+ if (Opcode == ARM::ADDri) {
+ Offset += MI.getOperand(FrameRegIdx+1).getImm();
+ if (Offset == 0) {
+ // Turn it into a move.
+ MI.setDesc(TII.get(ARM::MOVr));
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.RemoveOperand(FrameRegIdx+1);
+ Offset = 0;
+ return true;
+ } else if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ MI.setDesc(TII.get(ARM::SUBri));
+ }
+
+ // Common case: small offset, fits into instruction.
+ if (ARM_AM::getSOImmVal(Offset) != -1) {
+ // Replace the FrameIndex with sp / fp
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, pull as much of the immediate into this ADDri/SUBri
+ // as possible.
+ unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
+ unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
+
+ // We will handle these bits from offset, clear them.
+ Offset &= ~ThisImmVal;
+
+ // Get the properly encoded SOImmVal field.
+ assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 &&
+ "Bit extraction didn't work?");
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+ } else {
+ unsigned ImmIdx = 0;
+ int InstrOffs = 0;
+ unsigned NumBits = 0;
+ unsigned Scale = 1;
+ switch (AddrMode) {
+ case ARMII::AddrMode2: {
+ ImmIdx = FrameRegIdx+2;
+ InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 12;
+ break;
+ }
+ case ARMII::AddrMode3: {
+ ImmIdx = FrameRegIdx+2;
+ InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ break;
+ }
+ case ARMII::AddrMode4:
+ // Can't fold any offset even if it's zero.
+ return false;
+ case ARMII::AddrMode5: {
+ ImmIdx = FrameRegIdx+1;
+ InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ Scale = 4;
+ break;
+ }
+ default:
+ llvm_unreachable("Unsupported addressing mode!");
+ break;
+ }
+
+ Offset += InstrOffs * Scale;
+ assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ }
+
+ // Attempt to fold address comp. if opcode has offset bits
+ if (NumBits > 0) {
+ // Common case: small offset, fits into instruction.
+ MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+ int ImmedOffset = Offset / Scale;
+ unsigned Mask = (1 << NumBits) - 1;
+ if ((unsigned)Offset <= Mask * Scale) {
+ // Replace the FrameIndex with sp
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ if (isSub)
+ ImmedOffset |= 1 << NumBits;
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+ ImmedOffset = ImmedOffset & Mask;
+ if (isSub)
+ ImmedOffset |= 1 << NumBits;
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset &= ~(Mask*Scale);
+ }
+ }
+
+ Offset = (isSub) ? -Offset : Offset;
+ return Offset == 0;
+}
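
emitARMRegPlusImmediate above materializes an arbitrary register-plus-offset by peeling one encodable rotated-8-bit chunk off NumBytes per iteration, emitting one ADDri/SUBri for each. A standalone sketch of just the chunking loop (simplified getSOImmValRotate; the GCC/Clang builtin __builtin_ctz stands in for CountTrailingZeros_32):

#include <cassert>
#include <cstdio>

static unsigned rotr32(unsigned Val, unsigned Amt) {
  return (Val >> Amt) | (Val << ((32 - Amt) & 31));
}

// Simplified getSOImmValRotate: the even rotate amount (as the HW requires)
// that exposes the lowest run of set bits as an 8-bit payload.
static unsigned soImmValRotate(unsigned Imm) {
  unsigned TZ = (unsigned)__builtin_ctz(Imm);
  return (32 - (TZ & ~1u)) & 31;
}

int main() {
  unsigned NumBytes = 0x12345;  // offset needing several ADDri chunks
  unsigned Chunks = 0;
  while (NumBytes) {
    unsigned RotAmt  = soImmValRotate(NumBytes);
    unsigned ThisVal = NumBytes & rotr32(0xFF, RotAmt);
    assert(ThisVal && "Didn't extract field correctly");
    std::printf("ADDri chunk %u: %#x\n", ++Chunks, ThisVal);
    NumBytes &= ~ThisVal;       // these bits are now handled
  }
  return 0;                     // prints 0x45, 0x2300, 0x10000
}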
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
new file mode 100644
index 0000000..a13155b
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -0,0 +1,333 @@
+//===- ARMBaseInstrInfo.h - ARM Base Instruction Information -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEINSTRUCTIONINFO_H
+#define ARMBASEINSTRUCTIONINFO_H
+
+#include "ARM.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace ARMII {
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction Flags.
+
+ //===------------------------------------------------------------------===//
+ // This four-bit field describes the addressing mode used.
+
+ AddrModeMask = 0xf,
+ AddrModeNone = 0,
+ AddrMode1 = 1,
+ AddrMode2 = 2,
+ AddrMode3 = 3,
+ AddrMode4 = 4,
+ AddrMode5 = 5,
+ AddrMode6 = 6,
+ AddrModeT1_1 = 7,
+ AddrModeT1_2 = 8,
+ AddrModeT1_4 = 9,
+ AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data
+ AddrModeT2_i12 = 11,
+ AddrModeT2_i8 = 12,
+ AddrModeT2_so = 13,
+ AddrModeT2_pc = 14, // +/- i12 for pc relative data
+ AddrModeT2_i8s4 = 15, // i8 * 4
+
+ // Size* - Flags to keep track of the size of an instruction.
+ SizeShift = 4,
+ SizeMask = 7 << SizeShift,
+ SizeSpecial = 1, // 0 byte pseudo or special case.
+ Size8Bytes = 2,
+ Size4Bytes = 3,
+ Size2Bytes = 4,
+
+ // IndexMode - Unindex, pre-indexed, or post-indexed. Only valid for load
+ // and store ops
+ IndexModeShift = 7,
+ IndexModeMask = 3 << IndexModeShift,
+ IndexModePre = 1,
+ IndexModePost = 2,
+
+ //===------------------------------------------------------------------===//
+ // Instruction encoding formats.
+ //
+ FormShift = 9,
+ FormMask = 0x3f << FormShift,
+
+ // Pseudo instructions
+ Pseudo = 0 << FormShift,
+
+ // Multiply instructions
+ MulFrm = 1 << FormShift,
+
+ // Branch instructions
+ BrFrm = 2 << FormShift,
+ BrMiscFrm = 3 << FormShift,
+
+ // Data Processing instructions
+ DPFrm = 4 << FormShift,
+ DPSoRegFrm = 5 << FormShift,
+
+ // Load and Store
+ LdFrm = 6 << FormShift,
+ StFrm = 7 << FormShift,
+ LdMiscFrm = 8 << FormShift,
+ StMiscFrm = 9 << FormShift,
+ LdStMulFrm = 10 << FormShift,
+
+ // Miscellaneous arithmetic instructions
+ ArithMiscFrm = 11 << FormShift,
+
+ // Extend instructions
+ ExtFrm = 12 << FormShift,
+
+ // VFP formats
+ VFPUnaryFrm = 13 << FormShift,
+ VFPBinaryFrm = 14 << FormShift,
+ VFPConv1Frm = 15 << FormShift,
+ VFPConv2Frm = 16 << FormShift,
+ VFPConv3Frm = 17 << FormShift,
+ VFPConv4Frm = 18 << FormShift,
+ VFPConv5Frm = 19 << FormShift,
+ VFPLdStFrm = 20 << FormShift,
+ VFPLdStMulFrm = 21 << FormShift,
+ VFPMiscFrm = 22 << FormShift,
+
+ // Thumb format
+ ThumbFrm = 23 << FormShift,
+
+ // NEON format
+ NEONFrm = 24 << FormShift,
+ NEONGetLnFrm = 25 << FormShift,
+ NEONSetLnFrm = 26 << FormShift,
+ NEONDupFrm = 27 << FormShift,
+
+ //===------------------------------------------------------------------===//
+ // Misc flags.
+
+ // UnaryDP - Indicates this is a unary data processing instruction, i.e.
+ // it doesn't have a Rn operand.
+ UnaryDP = 1 << 15,
+
+ // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+ // a 16-bit Thumb instruction if certain conditions are met.
+ Xform16Bit = 1 << 16,
+
+ //===------------------------------------------------------------------===//
+ // Field shifts - such shifts are used to set field while generating
+ // machine instructions.
+ M_BitShift = 5,
+ ShiftImmShift = 5,
+ ShiftShift = 7,
+ N_BitShift = 7,
+ ImmHiShift = 8,
+ SoRotImmShift = 8,
+ RegRsShift = 8,
+ ExtRotImmShift = 10,
+ RegRdLoShift = 12,
+ RegRdShift = 12,
+ RegRdHiShift = 16,
+ RegRnShift = 16,
+ S_BitShift = 20,
+ W_BitShift = 21,
+ AM3_I_BitShift = 22,
+ D_BitShift = 22,
+ U_BitShift = 23,
+ P_BitShift = 24,
+ I_BitShift = 25,
+ CondShift = 28
+ };
+}
+
+class ARMBaseInstrInfo : public TargetInstrInfoImpl {
+protected:
+ // Can be only subclassed.
+ explicit ARMBaseInstrInfo();
+public:
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is no such opcode.
+ virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
+
+ // Return true if the block does not fall through.
+ virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const =0;
+
+ virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+ virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
+
+ // Branch analysis.
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+ // Predication support.
+ bool isPredicated(const MachineInstr *MI) const {
+ int PIdx = MI->findFirstPredOperandIdx();
+ return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
+ }
+
+ ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
+ int PIdx = MI->findFirstPredOperandIdx();
+ return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
+ : ARMCC::AL;
+ }
+
+ virtual
+ bool PredicateInstruction(MachineInstr *MI,
+ const SmallVectorImpl<MachineOperand> &Pred) const;
+
+ virtual
+ bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+ const SmallVectorImpl<MachineOperand> &Pred2) const;
+
+ virtual bool DefinesPredicate(MachineInstr *MI,
+ std::vector<MachineOperand> &Pred) const;
+
+ /// GetInstSize - Returns the size of the specified MachineInstr.
+ ///
+ virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual bool canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const;
+
+};
+
+static inline
+const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
+ return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
+ return MIB.addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB,
+ bool isDead = false) {
+ return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead));
+}
+
+static inline
+const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) {
+ return MIB.addReg(0);
+}
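These helpers are meant to be chained onto a BuildMI call so that every
predicable instruction ends with the standard trailing operands: the predicate
(condition code plus condition register) and, for AddDefaultCC, the optional
cc_out def. A hedged sketch of a typical call site (MBB, MBBI, dl, TII,
DestReg and BaseReg assumed in scope), mirroring the prologue code later in
this patch:

    // Build "add DestReg, BaseReg, #0", predicated with AL, no cc_out set.
    MachineInstrBuilder MIB =
      BuildMI(MBB, MBBI, dl, TII.get(ARM::ADDri), DestReg)
        .addReg(BaseReg).addImm(0);
    AddDefaultCC(AddDefaultPred(MIB)); // appends pred = AL, predreg = 0, cc_out = 0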
+
+static inline
+bool isUncondBranchOpcode(int Opc) {
+ return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B;
+}
+
+static inline
+bool isCondBranchOpcode(int Opc) {
+ return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc;
+}
+
+static inline
+bool isJumpTableBranchOpcode(int Opc) {
+ return Opc == ARM::BR_JTr || Opc == ARM::BR_JTm || Opc == ARM::BR_JTadd ||
+ Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT;
+}
+
+/// getInstrPredicate - If the instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+
+int getMatchingCondBranchOpcode(int Opc);
+
+/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emit a series of
+/// instructions to materialize a destreg = basereg + immediate in ARM / Thumb2
+/// code.
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ unsigned DestReg, unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII);
+
+void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ unsigned DestReg, unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII);
+
+
+/// rewriteARMFrameIndex / rewriteT2FrameIndex -
+/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
+/// offset could not be handled directly in MI, and return the left-over
+/// portion by reference.
+bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII);
+
+bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII);
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
new file mode 100644
index 0000000..42ef183
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -0,0 +1,1360 @@
+//===- ARMBaseRegisterInfo.cpp - ARM Register Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum,
+ bool *isSPVFP) {
+ if (isSPVFP)
+ *isSPVFP = false;
+
+ using namespace ARM;
+ switch (RegEnum) {
+ default:
+ llvm_unreachable("Unknown ARM register!");
+ case R0: case D0: case Q0: return 0;
+ case R1: case D1: case Q1: return 1;
+ case R2: case D2: case Q2: return 2;
+ case R3: case D3: case Q3: return 3;
+ case R4: case D4: case Q4: return 4;
+ case R5: case D5: case Q5: return 5;
+ case R6: case D6: case Q6: return 6;
+ case R7: case D7: case Q7: return 7;
+ case R8: case D8: case Q8: return 8;
+ case R9: case D9: case Q9: return 9;
+ case R10: case D10: case Q10: return 10;
+ case R11: case D11: case Q11: return 11;
+ case R12: case D12: case Q12: return 12;
+ case SP: case D13: case Q13: return 13;
+ case LR: case D14: case Q14: return 14;
+ case PC: case D15: case Q15: return 15;
+
+ case D16: return 16;
+ case D17: return 17;
+ case D18: return 18;
+ case D19: return 19;
+ case D20: return 20;
+ case D21: return 21;
+ case D22: return 22;
+ case D23: return 23;
+ case D24: return 24;
+ case D25: return 25;
+ case D26: return 26;
+ case D27: return 27;
+ case D28: return 28;
+ case D29: return 29;
+ case D30: return 30;
+ case D31: return 31;
+
+ case S0: case S1: case S2: case S3:
+ case S4: case S5: case S6: case S7:
+ case S8: case S9: case S10: case S11:
+ case S12: case S13: case S14: case S15:
+ case S16: case S17: case S18: case S19:
+ case S20: case S21: case S22: case S23:
+ case S24: case S25: case S26: case S27:
+ case S28: case S29: case S30: case S31: {
+ if (isSPVFP)
+ *isSPVFP = true;
+ switch (RegEnum) {
+ default: return 0; // Avoid compile time warning.
+ case S0: return 0;
+ case S1: return 1;
+ case S2: return 2;
+ case S3: return 3;
+ case S4: return 4;
+ case S5: return 5;
+ case S6: return 6;
+ case S7: return 7;
+ case S8: return 8;
+ case S9: return 9;
+ case S10: return 10;
+ case S11: return 11;
+ case S12: return 12;
+ case S13: return 13;
+ case S14: return 14;
+ case S15: return 15;
+ case S16: return 16;
+ case S17: return 17;
+ case S18: return 18;
+ case S19: return 19;
+ case S20: return 20;
+ case S21: return 21;
+ case S22: return 22;
+ case S23: return 23;
+ case S24: return 24;
+ case S25: return 25;
+ case S26: return 26;
+ case S27: return 27;
+ case S28: return 28;
+ case S29: return 29;
+ case S30: return 30;
+ case S31: return 31;
+ }
+ }
+ }
+}
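For example (a hedged sketch of a call site), querying a single-precision VFP
register sets the by-reference flag, while a core register leaves it false:

    bool isSPVFP = false;
    unsigned N = ARMBaseRegisterInfo::getRegisterNumbering(ARM::S5, &isSPVFP);
    // N == 5, isSPVFP == true
    N = ARMBaseRegisterInfo::getRegisterNumbering(ARM::LR, &isSPVFP);
    // N == 14, isSPVFP == false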
+
+ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
+ const ARMSubtarget &sti)
+ : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+ TII(tii), STI(sti),
+ FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11) {
+}
+
+const unsigned*
+ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ static const unsigned CalleeSavedRegs[] = {
+ ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8,
+ ARM::R7, ARM::R6, ARM::R5, ARM::R4,
+
+ ARM::D15, ARM::D14, ARM::D13, ARM::D12,
+ ARM::D11, ARM::D10, ARM::D9, ARM::D8,
+ 0
+ };
+
+ static const unsigned DarwinCalleeSavedRegs[] = {
+ // The Darwin ABI deviates from the ARM standard ABI. R9 is not a
+ // callee-saved register.
+ ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4,
+ ARM::R11, ARM::R10, ARM::R8,
+
+ ARM::D15, ARM::D14, ARM::D13, ARM::D12,
+ ARM::D11, ARM::D10, ARM::D9, ARM::D8,
+ 0
+ };
+ return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const *
+ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+
+ static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = {
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass,
+ &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+
+ static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = {
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+
+ static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={
+ &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass,
+ &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+
+ if (STI.isThumb1Only()) {
+ return STI.isTargetDarwin()
+ ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses;
+ }
+ return STI.isTargetDarwin()
+ ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses;
+}
+
+BitVector ARMBaseRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ // FIXME: avoid re-calculating this every time.
+ BitVector Reserved(getNumRegs());
+ Reserved.set(ARM::SP);
+ Reserved.set(ARM::PC);
+ if (STI.isTargetDarwin() || hasFP(MF))
+ Reserved.set(FramePtr);
+ // Some targets reserve R9.
+ if (STI.isR9Reserved())
+ Reserved.set(ARM::R9);
+ return Reserved;
+}
+
+bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF,
+ unsigned Reg) const {
+ switch (Reg) {
+ default: break;
+ case ARM::SP:
+ case ARM::PC:
+ return true;
+ case ARM::R7:
+ case ARM::R11:
+ if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF)))
+ return true;
+ break;
+ case ARM::R9:
+ return STI.isR9Reserved();
+ }
+
+ return false;
+}
+
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const {
+ return ARM::GPRRegisterClass;
+}
+
+/// getAllocationOrder - Returns the register allocation order for a specified
+/// register class in the form of a pair of TargetRegisterClass iterators.
+std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
+ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC,
+ unsigned HintType, unsigned HintReg,
+ const MachineFunction &MF) const {
+ // Alternative register allocation orders when favoring even / odd registers
+ // of register pairs.
+
+ // No FP, R9 is available.
+ static const unsigned GPREven1[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, ARM::R10,
+ ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7,
+ ARM::R9, ARM::R11
+ };
+ static const unsigned GPROdd1[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R9, ARM::R11,
+ ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6,
+ ARM::R8, ARM::R10
+ };
+
+ // FP is R7, R9 is available.
+ static const unsigned GPREven2[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R8, ARM::R10,
+ ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6,
+ ARM::R9, ARM::R11
+ };
+ static const unsigned GPROdd2[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R9, ARM::R11,
+ ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6,
+ ARM::R8, ARM::R10
+ };
+
+ // FP is R11, R9 is available.
+ static const unsigned GPREven3[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8,
+ ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7,
+ ARM::R9
+ };
+ static const unsigned GPROdd3[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R9,
+ ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6,
+ ARM::R8
+ };
+
+ // No FP, R9 is not available.
+ static const unsigned GPREven4[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R10,
+ ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8,
+ ARM::R11
+ };
+ static const unsigned GPROdd4[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R11,
+ ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8,
+ ARM::R10
+ };
+
+ // FP is R7, R9 is not available.
+ static const unsigned GPREven5[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R10,
+ ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, ARM::R8,
+ ARM::R11
+ };
+ static const unsigned GPROdd5[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R11,
+ ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8,
+ ARM::R10
+ };
+
+ // FP is R11, R9 is not available.
+ static const unsigned GPREven6[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R6,
+ ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8
+ };
+ static const unsigned GPROdd6[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R7,
+ ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8
+ };
+
+
+ if (HintType == ARMRI::RegPairEven) {
+ if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0)
+ // It's no longer possible to fulfill this hint. Return the default
+ // allocation order.
+ return std::make_pair(RC->allocation_order_begin(MF),
+ RC->allocation_order_end(MF));
+
+ if (!STI.isTargetDarwin() && !hasFP(MF)) {
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPREven1,
+ GPREven1 + (sizeof(GPREven1)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPREven4,
+ GPREven4 + (sizeof(GPREven4)/sizeof(unsigned)));
+ } else if (FramePtr == ARM::R7) {
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPREven2,
+ GPREven2 + (sizeof(GPREven2)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPREven5,
+ GPREven5 + (sizeof(GPREven5)/sizeof(unsigned)));
+ } else { // FramePtr == ARM::R11
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPREven3,
+ GPREven3 + (sizeof(GPREven3)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPREven6,
+ GPREven6 + (sizeof(GPREven6)/sizeof(unsigned)));
+ }
+ } else if (HintType == ARMRI::RegPairOdd) {
+ if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0)
+ // It's no longer possible to fulfill this hint. Return the default
+ // allocation order.
+ return std::make_pair(RC->allocation_order_begin(MF),
+ RC->allocation_order_end(MF));
+
+ if (!STI.isTargetDarwin() && !hasFP(MF)) {
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPROdd1,
+ GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPROdd4,
+ GPROdd4 + (sizeof(GPROdd4)/sizeof(unsigned)));
+ } else if (FramePtr == ARM::R7) {
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPROdd2,
+ GPROdd2 + (sizeof(GPROdd2)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPROdd5,
+ GPROdd5 + (sizeof(GPROdd5)/sizeof(unsigned)));
+ } else { // FramePtr == ARM::R11
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPROdd3,
+ GPROdd3 + (sizeof(GPROdd3)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPROdd6,
+ GPROdd6 + (sizeof(GPROdd6)/sizeof(unsigned)));
+ }
+ }
+ return std::make_pair(RC->allocation_order_begin(MF),
+ RC->allocation_order_end(MF));
+}
+
+/// ResolveRegAllocHint - Resolves the specified register allocation hint
+/// to a physical register. Returns the physical register if it is successful.
+unsigned
+ARMBaseRegisterInfo::ResolveRegAllocHint(unsigned Type, unsigned Reg,
+ const MachineFunction &MF) const {
+ if (Reg == 0 || !isPhysicalRegister(Reg))
+ return 0;
+ if (Type == 0)
+ return Reg;
+ else if (Type == (unsigned)ARMRI::RegPairOdd)
+ // Odd register.
+ return getRegisterPairOdd(Reg, MF);
+ else if (Type == (unsigned)ARMRI::RegPairEven)
+ // Even register.
+ return getRegisterPairEven(Reg, MF);
+ return 0;
+}
+
+void
+ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+ MachineFunction &MF) const {
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
+ if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
+ Hint.first == (unsigned)ARMRI::RegPairEven) &&
+ Hint.second && TargetRegisterInfo::isVirtualRegister(Hint.second)) {
+ // If 'Reg' is one half of an even / odd register pair and it has now been
+ // changed (e.g. coalesced) into a different register, the allocation hint
+ // of the other register of the pair must be updated to reflect the
+ // relationship change.
+ unsigned OtherReg = Hint.second;
+ Hint = MRI->getRegAllocationHint(OtherReg);
+ if (Hint.second == Reg)
+ // Make sure the pair has not already divorced.
+ MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
+ }
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register. This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+///
+bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return (NoFramePointerElim ||
+ MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken());
+}
+
+bool ARMBaseRegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ if (NoFramePointerElim && MFI->hasCalls())
+ return true;
+ return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+/// estimateStackSize - Estimate and return the size of the frame.
+static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) {
+ const MachineFrameInfo *FFI = MF.getFrameInfo();
+ int Offset = 0;
+ for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+ int FixedOff = -FFI->getObjectOffset(i);
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+ for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+ if (FFI->isDeadObjectIndex(i))
+ continue;
+ Offset += FFI->getObjectSize(i);
+ unsigned Align = FFI->getObjectAlignment(i);
+ // Adjust to alignment boundary
+ Offset = (Offset+Align-1)/Align*Align;
+ }
+ return (unsigned)Offset;
+}
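The per-object round-up uses the standard integer trick for aligning upward;
pulled out on its own as a quick check:

    // Round Offset up to the next multiple of Align (Align > 0).
    // e.g. Offset = 13, Align = 8  ->  (13 + 7) / 8 * 8 == 16
    static unsigned alignTo(unsigned Offset, unsigned Align) {
      return (Offset + Align - 1) / Align * Align;
    }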
+
+/// estimateRSStackSizeLimit - Look at each instruction that references stack
+/// frames and return the stack size limit beyond which some of these
+/// instructions will require a scratch register during their expansion later.
+unsigned
+ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const {
+ unsigned Limit = (1 << 12) - 1;
+ for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+ I != E; ++I) {
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ if (!I->getOperand(i).isFI()) continue;
+
+ const TargetInstrDesc &Desc = TII.get(I->getOpcode());
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ if (AddrMode == ARMII::AddrMode3 ||
+ AddrMode == ARMII::AddrModeT2_i8)
+ return (1 << 8) - 1;
+
+ if (AddrMode == ARMII::AddrMode5 ||
+ AddrMode == ARMII::AddrModeT2_i8s4)
+ Limit = std::min(Limit, ((1U << 8) - 1) * 4);
+
+ if (AddrMode == ARMII::AddrModeT2_i12 && hasFP(MF))
+ // When the stack offset is negative, we will end up using
+ // the i8 instructions instead.
+ return (1 << 8) - 1;
+ break; // At most one FI per instruction
+ }
+ }
+ }
+
+ return Limit;
+}
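The constants fall straight out of the immediate widths of the addressing
modes: the default is the unsigned 12-bit offset, AddrMode3 and the Thumb2 i8
forms carry only 8 bits, and AddrMode5 / T2_i8s4 scale an 8-bit count by 4. A
quick check of the arithmetic:

    static_assert((1 << 12) - 1 == 4095, "default imm12 limit");
    static_assert((1 << 8) - 1 == 255, "imm8 limit for AddrMode3 / T2_i8");
    static_assert(((1U << 8) - 1) * 4 == 1020, "scaled limit for AddrMode5 / T2_i8s4");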
+
+void
+ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ // This tells PEI to spill the FP as if it is any other callee-save register
+ // to take advantage of the eliminateFrameIndex machinery. This also ensures
+ // it is spilled in the order specified by getCalleeSavedRegs() to make it
+ // easier to combine multiple loads / stores.
+ bool CanEliminateFrame = true;
+ bool CS1Spilled = false;
+ bool LRSpilled = false;
+ unsigned NumGPRSpills = 0;
+ SmallVector<unsigned, 4> UnspilledCS1GPRs;
+ SmallVector<unsigned, 4> UnspilledCS2GPRs;
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+ // Don't spill FP if the frame can be eliminated. This is determined
+ // by scanning the callee-save registers to see if any is used.
+ const unsigned *CSRegs = getCalleeSavedRegs();
+ const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ bool Spilled = false;
+ if (MF.getRegInfo().isPhysRegUsed(Reg)) {
+ AFI->setCSRegisterIsSpilled(Reg);
+ Spilled = true;
+ CanEliminateFrame = false;
+ } else {
+ // Check alias registers too.
+ for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) {
+ if (MF.getRegInfo().isPhysRegUsed(*Aliases)) {
+ Spilled = true;
+ CanEliminateFrame = false;
+ }
+ }
+ }
+
+ if (CSRegClasses[i] == ARM::GPRRegisterClass ||
+ CSRegClasses[i] == ARM::tGPRRegisterClass) {
+ if (Spilled) {
+ NumGPRSpills++;
+
+ if (!STI.isTargetDarwin()) {
+ if (Reg == ARM::LR)
+ LRSpilled = true;
+ CS1Spilled = true;
+ continue;
+ }
+
+ // Keep track of whether LR and any of R4, R5, R6, or R7 are spilled.
+ switch (Reg) {
+ case ARM::LR:
+ LRSpilled = true;
+ // Fallthrough
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ CS1Spilled = true;
+ break;
+ default:
+ break;
+ }
+ } else {
+ if (!STI.isTargetDarwin()) {
+ UnspilledCS1GPRs.push_back(Reg);
+ continue;
+ }
+
+ switch (Reg) {
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ case ARM::LR:
+ UnspilledCS1GPRs.push_back(Reg);
+ break;
+ default:
+ UnspilledCS2GPRs.push_back(Reg);
+ break;
+ }
+ }
+ }
+ }
+
+ bool ForceLRSpill = false;
+ if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
+ unsigned FnSize = TII.GetFunctionSizeInBytes(MF);
+ // Force LR to be spilled if the Thumb function size is > 2048. This enables
+ // the use of BL to implement a far jump. If it turns out that it's not
+ // needed, the branch fix-up path will undo it.
+ if (FnSize >= (1 << 11)) {
+ CanEliminateFrame = false;
+ ForceLRSpill = true;
+ }
+ }
+
+ bool ExtraCSSpill = false;
+ if (!CanEliminateFrame || cannotEliminateFrame(MF)) {
+ AFI->setHasStackFrame(true);
+
+ // If LR is not spilled, but at least one of R4, R5, R6, or R7 is, spill LR
+ // as well so we can fold BX_RET into the register restore (LDM).
+ if (!LRSpilled && CS1Spilled) {
+ MF.getRegInfo().setPhysRegUsed(ARM::LR);
+ AFI->setCSRegisterIsSpilled(ARM::LR);
+ NumGPRSpills++;
+ UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
+ UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
+ ForceLRSpill = false;
+ ExtraCSSpill = true;
+ }
+
+ // Darwin ABI requires FP to point to the stack slot that contains the
+ // previous FP.
+ if (STI.isTargetDarwin() || hasFP(MF)) {
+ MF.getRegInfo().setPhysRegUsed(FramePtr);
+ NumGPRSpills++;
+ }
+
+ // If the stack and doubles are 8-byte aligned and we are spilling an odd
+ // number of GPRs, spill one extra callee-save GPR so we won't have to pad
+ // between the integer and double callee-save areas.
+ unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+ if (TargetAlign == 8 && (NumGPRSpills & 1)) {
+ if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+ for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+ unsigned Reg = UnspilledCS1GPRs[i];
+ // Don't spill a high register if the function is Thumb1.
+ if (!AFI->isThumb1OnlyFunction() ||
+ isARMLowRegister(Reg) || Reg == ARM::LR) {
+ MF.getRegInfo().setPhysRegUsed(Reg);
+ AFI->setCSRegisterIsSpilled(Reg);
+ if (!isReservedReg(MF, Reg))
+ ExtraCSSpill = true;
+ break;
+ }
+ }
+ } else if (!UnspilledCS2GPRs.empty() &&
+ !AFI->isThumb1OnlyFunction()) {
+ unsigned Reg = UnspilledCS2GPRs.front();
+ MF.getRegInfo().setPhysRegUsed(Reg);
+ AFI->setCSRegisterIsSpilled(Reg);
+ if (!isReservedReg(MF, Reg))
+ ExtraCSSpill = true;
+ }
+ }
+
+ // Estimate if we might need to scavenge a register at some point in order
+ // to materialize a stack offset. If so, either spill one additional
+ // callee-saved register or reserve a special spill slot to facilitate
+ // register scavenging. Thumb1 needs a spill slot for stack pointer
+ // adjustments also, even when the frame itself is small.
+ if (RS && !ExtraCSSpill) {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ // If any of the stack slot references may be out of range of an
+ // immediate offset, make sure a register (or a spill slot) is
+ // available for the register scavenger. Note that if we're indexing
+ // off the frame pointer, the effective stack size is 4 bytes larger
+ // since the FP points to the stack slot of the previous FP.
+ if (estimateStackSize(MF, MFI) + (hasFP(MF) ? 4 : 0)
+ >= estimateRSStackSizeLimit(MF)) {
+ // If any non-reserved CS register isn't spilled, just spill one or two
+ // extra. That should take care of it!
+ unsigned NumExtras = TargetAlign / 4;
+ SmallVector<unsigned, 2> Extras;
+ while (NumExtras && !UnspilledCS1GPRs.empty()) {
+ unsigned Reg = UnspilledCS1GPRs.back();
+ UnspilledCS1GPRs.pop_back();
+ if (!isReservedReg(MF, Reg)) {
+ Extras.push_back(Reg);
+ NumExtras--;
+ }
+ }
+ // For non-Thumb1 functions, also check for hi-reg CS registers
+ if (!AFI->isThumb1OnlyFunction()) {
+ while (NumExtras && !UnspilledCS2GPRs.empty()) {
+ unsigned Reg = UnspilledCS2GPRs.back();
+ UnspilledCS2GPRs.pop_back();
+ if (!isReservedReg(MF, Reg)) {
+ Extras.push_back(Reg);
+ NumExtras--;
+ }
+ }
+ }
+ if (Extras.size() && NumExtras == 0) {
+ for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+ MF.getRegInfo().setPhysRegUsed(Extras[i]);
+ AFI->setCSRegisterIsSpilled(Extras[i]);
+ }
+ } else if (!AFI->isThumb1OnlyFunction()) {
+ // note: Thumb1 functions spill to R12, not the stack.
+ // Reserve a slot closest to SP or frame pointer.
+ const TargetRegisterClass *RC = ARM::GPRRegisterClass;
+ RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+ RC->getAlignment()));
+ }
+ }
+ }
+ }
+
+ if (ForceLRSpill) {
+ MF.getRegInfo().setPhysRegUsed(ARM::LR);
+ AFI->setCSRegisterIsSpilled(ARM::LR);
+ AFI->setLRIsSpilledForFarJump(true);
+ }
+}
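As a concrete check of the padding rule above: with TargetAlign == 8 and three
GPRs already spilled, one extra callee-save GPR rounds the integer area up so
the DPR area that follows starts 8-byte aligned without a padding slot:

    // NumGPRSpills = 3  ->  3 * 4 == 12 bytes, not a multiple of 8
    // spill one extra  ->  4 * 4 == 16 bytes, DPR area starts aligned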
+
+unsigned ARMBaseRegisterInfo::getRARegister() const {
+ return ARM::LR;
+}
+
+unsigned ARMBaseRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ if (STI.isTargetDarwin() || hasFP(MF))
+ return FramePtr;
+ return ARM::SP;
+}
+
+unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const {
+ llvm_unreachable("What is the exception register");
+ return 0;
+}
+
+unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
+ llvm_unreachable("What is the exception handler register");
+ return 0;
+}
+
+int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
+ const MachineFunction &MF) const {
+ switch (Reg) {
+ default: break;
+ // Return 0 if either register of the pair is a special register.
+ // So no R12, etc.
+ case ARM::R1:
+ return ARM::R0;
+ case ARM::R3:
+ // FIXME!
+ return STI.isThumb1Only() ? 0 : ARM::R2;
+ case ARM::R5:
+ return ARM::R4;
+ case ARM::R7:
+ return isReservedReg(MF, ARM::R7) ? 0 : ARM::R6;
+ case ARM::R9:
+ return isReservedReg(MF, ARM::R9) ? 0 : ARM::R8;
+ case ARM::R11:
+ return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10;
+
+ case ARM::S1:
+ return ARM::S0;
+ case ARM::S3:
+ return ARM::S2;
+ case ARM::S5:
+ return ARM::S4;
+ case ARM::S7:
+ return ARM::S6;
+ case ARM::S9:
+ return ARM::S8;
+ case ARM::S11:
+ return ARM::S10;
+ case ARM::S13:
+ return ARM::S12;
+ case ARM::S15:
+ return ARM::S14;
+ case ARM::S17:
+ return ARM::S16;
+ case ARM::S19:
+ return ARM::S18;
+ case ARM::S21:
+ return ARM::S20;
+ case ARM::S23:
+ return ARM::S22;
+ case ARM::S25:
+ return ARM::S24;
+ case ARM::S27:
+ return ARM::S26;
+ case ARM::S29:
+ return ARM::S28;
+ case ARM::S31:
+ return ARM::S30;
+
+ case ARM::D1:
+ return ARM::D0;
+ case ARM::D3:
+ return ARM::D2;
+ case ARM::D5:
+ return ARM::D4;
+ case ARM::D7:
+ return ARM::D6;
+ case ARM::D9:
+ return ARM::D8;
+ case ARM::D11:
+ return ARM::D10;
+ case ARM::D13:
+ return ARM::D12;
+ case ARM::D15:
+ return ARM::D14;
+ case ARM::D17:
+ return ARM::D16;
+ case ARM::D19:
+ return ARM::D18;
+ case ARM::D21:
+ return ARM::D20;
+ case ARM::D23:
+ return ARM::D22;
+ case ARM::D25:
+ return ARM::D24;
+ case ARM::D27:
+ return ARM::D26;
+ case ARM::D29:
+ return ARM::D28;
+ case ARM::D31:
+ return ARM::D30;
+ }
+
+ return 0;
+}
+
+unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
+ const MachineFunction &MF) const {
+ switch (Reg) {
+ default: break;
+ // Return 0 if either register of the pair is a special register.
+ // So no R12, etc.
+ case ARM::R0:
+ return ARM::R1;
+ case ARM::R2:
+ // FIXME!
+ return STI.isThumb1Only() ? 0 : ARM::R3;
+ case ARM::R4:
+ return ARM::R5;
+ case ARM::R6:
+ return isReservedReg(MF, ARM::R7) ? 0 : ARM::R7;
+ case ARM::R8:
+ return isReservedReg(MF, ARM::R9) ? 0 : ARM::R9;
+ case ARM::R10:
+ return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11;
+
+ case ARM::S0:
+ return ARM::S1;
+ case ARM::S2:
+ return ARM::S3;
+ case ARM::S4:
+ return ARM::S5;
+ case ARM::S6:
+ return ARM::S7;
+ case ARM::S8:
+ return ARM::S9;
+ case ARM::S10:
+ return ARM::S11;
+ case ARM::S12:
+ return ARM::S13;
+ case ARM::S14:
+ return ARM::S15;
+ case ARM::S16:
+ return ARM::S17;
+ case ARM::S18:
+ return ARM::S19;
+ case ARM::S20:
+ return ARM::S21;
+ case ARM::S22:
+ return ARM::S23;
+ case ARM::S24:
+ return ARM::S25;
+ case ARM::S26:
+ return ARM::S27;
+ case ARM::S28:
+ return ARM::S29;
+ case ARM::S30:
+ return ARM::S31;
+
+ case ARM::D0:
+ return ARM::D1;
+ case ARM::D2:
+ return ARM::D3;
+ case ARM::D4:
+ return ARM::D5;
+ case ARM::D6:
+ return ARM::D7;
+ case ARM::D8:
+ return ARM::D9;
+ case ARM::D10:
+ return ARM::D11;
+ case ARM::D12:
+ return ARM::D13;
+ case ARM::D14:
+ return ARM::D15;
+ case ARM::D16:
+ return ARM::D17;
+ case ARM::D18:
+ return ARM::D19;
+ case ARM::D20:
+ return ARM::D21;
+ case ARM::D22:
+ return ARM::D23;
+ case ARM::D24:
+ return ARM::D25;
+ case ARM::D26:
+ return ARM::D27;
+ case ARM::D28:
+ return ARM::D29;
+ case ARM::D30:
+ return ARM::D31;
+ }
+
+ return 0;
+}
+
+/// emitLoadConstPool - Emits a load from the constant pool to materialize the
+/// specified immediate.
+void ARMBaseRegisterInfo::
+emitLoadConstPool(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred,
+ unsigned PredReg) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineConstantPool *ConstantPool = MF.getConstantPool();
+ Constant *C =
+ ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
+ .addReg(DestReg, getDefRegState(true), SubIdx)
+ .addConstantPoolIndex(Idx)
+ .addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+}
+
+bool ARMBaseRegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+ return true;
+}
+
+// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+// not required, we reserve argument space for call sites in the function
+// immediately on entry to it. This eliminates the need for add/sub sp
+// brackets around call sites. Returns true if the call frame is included as
+// part of the stack frame.
+bool ARMBaseRegisterInfo::
+hasReservedCallFrame(MachineFunction &MF) const {
+ const MachineFrameInfo *FFI = MF.getFrameInfo();
+ unsigned CFSize = FFI->getMaxCallFrameSize();
+ // It's not always a good idea to include the call frame as part of the
+ // stack frame. ARM (especially Thumb) has only small immediate offsets for
+ // addressing the stack frame, so a large call frame can cause poor codegen
+ // and may even make it impossible to scavenge a register.
+ if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12
+ return false;
+
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
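Half of the imm12 range works out to 2047 bytes; call frames at or above that
size are instead handled with explicit SP adjustments around each call:

    static_assert(((1 << 12) - 1) / 2 == 2047, "reserved-call-frame cutoff");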
+
+static void
+emitSPUpdate(bool isARM,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl, const ARMBaseInstrInfo &TII,
+ int NumBytes,
+ ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+ if (isARM)
+ emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+ Pred, PredReg, TII);
+ else
+ emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+ Pred, PredReg, TII);
+}
+
+
+void ARMBaseRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (!hasReservedCallFrame(MF)) {
+ // If we have alloca, convert as follows:
+ // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+ // ADJCALLSTACKUP -> add, sp, sp, amount
+ MachineInstr *Old = I;
+ DebugLoc dl = Old->getDebugLoc();
+ unsigned Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ Amount = (Amount+Align-1)/Align*Align;
+
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This eliminateCallFramePseudoInstr does not suppor Thumb1!");
+ bool isARM = !AFI->isThumbFunction();
+
+ // Replace the pseudo instruction with a new instruction...
+ unsigned Opc = Old->getOpcode();
+ ARMCC::CondCodes Pred = (ARMCC::CondCodes)Old->getOperand(1).getImm();
+ // FIXME: Thumb2 version of ADJCALLSTACKUP and ADJCALLSTACKDOWN?
+ if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+ // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
+ unsigned PredReg = Old->getOperand(2).getReg();
+ emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, Pred, PredReg);
+ } else {
+ // Note: PredReg is operand 3 for ADJCALLSTACKUP.
+ unsigned PredReg = Old->getOperand(3).getReg();
+ assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+ emitSPUpdate(isARM, MBB, I, dl, TII, Amount, Pred, PredReg);
+ }
+ }
+ }
+ MBB.erase(I);
+}
+
+/// findScratchRegister - Find a 'free' ARM register. If the register
+/// scavenger is not being used, R12 is available. Otherwise, try for a
+/// call-clobbered register first and then a spilled callee-saved register
+/// if that fails.
+static
+unsigned findScratchRegister(RegScavenger *RS, const TargetRegisterClass *RC,
+ ARMFunctionInfo *AFI) {
+ unsigned Reg = RS ? RS->FindUnusedReg(RC) : (unsigned) ARM::R12;
+ assert(!AFI->isThumb1OnlyFunction());
+ return Reg;
+}
+
+unsigned
+ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const {
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This eliminateFrameIndex does not support Thumb1!");
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ unsigned FrameReg = ARM::SP;
+ int FrameIndex = MI.getOperand(i).getIndex();
+ int Offset = MFI->getObjectOffset(FrameIndex) + MFI->getStackSize() + SPAdj;
+
+ if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
+ Offset -= AFI->getGPRCalleeSavedArea1Offset();
+ else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
+ Offset -= AFI->getGPRCalleeSavedArea2Offset();
+ else if (AFI->isDPRCalleeSavedAreaFrame(FrameIndex))
+ Offset -= AFI->getDPRCalleeSavedAreaOffset();
+ else if (hasFP(MF) && AFI->hasStackFrame()) {
+ assert(SPAdj == 0 && "Unexpected stack offset!");
+ // Use the frame pointer to reference fixed objects unless this is a
+ // frameless function.
+ FrameReg = getFrameRegister(MF);
+ Offset -= AFI->getFramePtrSpillOffset();
+ }
+
+ // modify MI as necessary to handle as much of 'Offset' as possible
+ bool Done = false;
+ if (!AFI->isThumbFunction())
+ Done = rewriteARMFrameIndex(MI, i, FrameReg, Offset, TII);
+ else {
+ assert(AFI->isThumb2Function());
+ Done = rewriteT2FrameIndex(MI, i, FrameReg, Offset, TII);
+ }
+ if (Done)
+ return 0;
+
+ // If we get here, the immediate doesn't fit into the instruction. We folded
+ // as much as possible above, handle the rest, providing a register that is
+ // SP+LargeImm.
+ assert((Offset ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4) &&
+ "This code isn't needed if offset already handled!");
+
+ // Materialize the full address in the scratch register: scratch = sp + offset.
+ // If the offset we have is too large to fit into the instruction, we need
+ // to form it with a series of ADDri's. Do this by taking 8-bit chunks
+ // out of 'Offset'.
+ unsigned ScratchReg = findScratchRegister(RS, ARM::GPRRegisterClass, AFI);
+ if (ScratchReg == 0)
+ // No register is "free". Scavenge a register.
+ ScratchReg = RS->scavengeRegister(ARM::GPRRegisterClass, II, SPAdj);
+ int PIdx = MI.findFirstPredOperandIdx();
+ ARMCC::CondCodes Pred = (PIdx == -1)
+ ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+ unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+ if (Offset == 0)
+ // Must be addrmode4.
+ MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false);
+ else {
+ if (!AFI->isThumbFunction())
+ emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+ Offset, Pred, PredReg, TII);
+ else {
+ assert(AFI->isThumb2Function());
+ emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+ Offset, Pred, PredReg, TII);
+ }
+ MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
+ }
+ return 0;
+}
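Schematically, when the offset overflows the instruction's immediate field,
the logic above turns a frame-index reference into a two-instruction sequence
through the scratch register (a hedged sketch; the offsets are illustrative):

    // Before:  ldr  r0, [<fi#3>]       ; fi#3 resolves to sp + 4100
    // After:   add  r12, sp, #4096     ; emitARMRegPlusImmediate, 8-bit chunks
    //          ldr  r0, [r12, #4]      ; residual folded by rewriteARMFrameIndex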
+
+/// Move the iterator past the next group of callee-save load / store ops for
+/// the particular spill area (1: integer area 1, 2: integer area 2,
+/// 3: fp area, 0: don't care).
+static void movePastCSLoadStoreOps(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ int Opc1, int Opc2, unsigned Area,
+ const ARMSubtarget &STI) {
+ while (MBBI != MBB.end() &&
+ ((MBBI->getOpcode() == Opc1) || (MBBI->getOpcode() == Opc2)) &&
+ MBBI->getOperand(1).isFI()) {
+ if (Area != 0) {
+ bool Done = false;
+ unsigned Category = 0;
+ switch (MBBI->getOperand(0).getReg()) {
+ case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7:
+ case ARM::LR:
+ Category = 1;
+ break;
+ case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11:
+ Category = STI.isTargetDarwin() ? 2 : 1;
+ break;
+ case ARM::D8: case ARM::D9: case ARM::D10: case ARM::D11:
+ case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15:
+ Category = 3;
+ break;
+ default:
+ Done = true;
+ break;
+ }
+ if (Done || Category != Area)
+ break;
+ }
+
+ ++MBBI;
+ }
+}
+
+void ARMBaseRegisterInfo::
+emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This emitPrologue does not suppor Thumb1!");
+ bool isARM = !AFI->isThumbFunction();
+ unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+ unsigned NumBytes = MFI->getStackSize();
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ DebugLoc dl = (MBBI != MBB.end() ?
+ MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+ // Determine the size of each callee-save spill area and record which frame
+ // index belongs to which area.
+ unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+ int FramePtrSpillFI = 0;
+
+ // Allocate the vararg register save area. This is not counted in NumBytes.
+ if (VARegSaveSize)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize);
+
+ if (!AFI->hasStackFrame()) {
+ if (NumBytes != 0)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
+ return;
+ }
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ int FI = CSI[i].getFrameIdx();
+ switch (Reg) {
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ case ARM::LR:
+ if (Reg == FramePtr)
+ FramePtrSpillFI = FI;
+ AFI->addGPRCalleeSavedArea1Frame(FI);
+ GPRCS1Size += 4;
+ break;
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ if (Reg == FramePtr)
+ FramePtrSpillFI = FI;
+ if (STI.isTargetDarwin()) {
+ AFI->addGPRCalleeSavedArea2Frame(FI);
+ GPRCS2Size += 4;
+ } else {
+ AFI->addGPRCalleeSavedArea1Frame(FI);
+ GPRCS1Size += 4;
+ }
+ break;
+ default:
+ AFI->addDPRCalleeSavedAreaFrame(FI);
+ DPRCSSize += 8;
+ }
+ }
+
+ // Build the new SUBri to adjust SP for integer callee-save spill area 1.
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS1Size);
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 1, STI);
+
+ // Set FP to point to the stack slot that contains the previous FP.
+ // On Darwin, FP is R7, which has now been stored in spill area 1.
+ // On other targets, all the callee-saved registers go into spill area 1,
+ // including the FP in R11. In either case, it is now safe to emit this
+ // assignment.
+ if (STI.isTargetDarwin() || hasFP(MF)) {
+ unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr)
+ .addFrameIndex(FramePtrSpillFI).addImm(0);
+ AddDefaultCC(AddDefaultPred(MIB));
+ }
+
+ // Build the new SUBri to adjust SP for integer callee-save spill area 2.
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS2Size);
+
+ // Build the new SUBri to adjust SP for FP callee-save spill area.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 2, STI);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRCSSize);
+
+ // Determine starting offsets of spill areas.
+ unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+ unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+ unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+ AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
+ AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+ AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+ AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+ NumBytes = DPRCSOffset;
+ if (NumBytes) {
+ // Insert it after all the callee-save spills.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::FSTD, 0, 3, STI);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
+ }
+
+ if (STI.isTargetELF() && hasFP(MF)) {
+ MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+ AFI->getFramePtrSpillOffset());
+ }
+
+ AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+ AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+ AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+}
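Putting the offset bookkeeping together, the frame this prologue builds looks
like the following, from higher to lower addresses (a schematic, not part of
the patch; offsets are measured from the post-prologue SP):

    <- SP on entry
    | vararg register save area |  VARegSaveSize, not counted in NumBytes
    | GPR callee-save area 1    |  GPRCS1Offset = GPRCS2Offset + GPRCS2Size
    | GPR callee-save area 2    |  GPRCS2Offset = DPRCSOffset + DPRCSSize (Darwin: R8-R11)
    | DPR callee-save area      |  DPRCSOffset  = NumBytes - (sum of the three sizes)
    | locals and spill slots    |
    <- SP after the prologue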
+
+static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ if (Reg == CSRegs[i])
+ return true;
+ return false;
+}
+
+static bool isCSRestore(MachineInstr *MI,
+ const ARMBaseInstrInfo &TII,
+ const unsigned *CSRegs) {
+ return ((MI->getOpcode() == (int)ARM::FLDD ||
+ MI->getOpcode() == (int)ARM::LDR ||
+ MI->getOpcode() == (int)ARM::t2LDRi12) &&
+ MI->getOperand(1).isFI() &&
+ isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
+}
+
+void ARMBaseRegisterInfo::
+emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ assert(MBBI->getDesc().isReturn() &&
+ "Can only insert epilog into returning blocks");
+ DebugLoc dl = MBBI->getDebugLoc();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This emitEpilogue does not suppor Thumb1!");
+ bool isARM = !AFI->isThumbFunction();
+
+ unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+ int NumBytes = (int)MFI->getStackSize();
+
+ if (!AFI->hasStackFrame()) {
+ if (NumBytes != 0)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+ } else {
+ // Unwind MBBI to point to the first LDR / FLDD.
+ const unsigned *CSRegs = getCalleeSavedRegs();
+ if (MBBI != MBB.begin()) {
+ do
+ --MBBI;
+ while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
+ if (!isCSRestore(MBBI, TII, CSRegs))
+ ++MBBI;
+ }
+
+ // Move SP to start of FP callee save spill area.
+ NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+ AFI->getGPRCalleeSavedArea2Size() +
+ AFI->getDPRCalleeSavedAreaSize());
+
+ // Darwin ABI requires FP to point to the stack slot that contains the
+ // previous FP.
+ bool HasFP = hasFP(MF);
+ if ((STI.isTargetDarwin() && NumBytes) || HasFP) {
+ NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+ // Reset SP based on the frame pointer only if the stack frame extends
+ // beyond the frame pointer stack slot, or the target is ELF and the
+ // function has an FP.
+ if (HasFP ||
+ AFI->getGPRCalleeSavedArea2Size() ||
+ AFI->getDPRCalleeSavedAreaSize() ||
+ AFI->getDPRCalleeSavedAreaOffset()) {
+ if (NumBytes) {
+ if (isARM)
+ emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+ ARMCC::AL, 0, TII);
+ else
+ emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+ ARMCC::AL, 0, TII);
+ } else {
+ // Thumb2 or ARM.
+ if (isARM)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
+ .addReg(FramePtr)
+ .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ else
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP)
+ .addReg(FramePtr);
+ }
+ }
+ } else if (NumBytes)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+
+ // Move SP to start of integer callee save spill area 2.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::FLDD, 0, 3, STI);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedAreaSize());
+
+ // Move SP to start of integer callee save spill area 1.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 2, STI);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea2Size());
+
+ // Restore SP to its value upon entry to the function.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 1, STI);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea1Size());
+ }
+
+ if (VARegSaveSize)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize);
+}
+
+#include "ARMGenRegisterInfo.inc"
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
new file mode 100644
index 0000000..da703fb
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -0,0 +1,148 @@
+//===- ARMBaseRegisterInfo.h - ARM Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEREGISTERINFO_H
+#define ARMBASEREGISTERINFO_H
+
+#include "ARM.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "ARMGenRegisterInfo.h.inc"
+
+namespace llvm {
+ class ARMSubtarget;
+ class ARMBaseInstrInfo;
+ class Type;
+
+/// Register allocation hints.
+namespace ARMRI {
+ enum {
+ RegPairOdd = 1,
+ RegPairEven = 2
+ };
+}
+
+/// isARMLowRegister - Returns true if the register is a low register (r0-r7).
+///
+static inline bool isARMLowRegister(unsigned Reg) {
+ using namespace ARM;
+ switch (Reg) {
+ case R0: case R1: case R2: case R3:
+ case R4: case R5: case R6: case R7:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct ARMBaseRegisterInfo : public ARMGenRegisterInfo {
+protected:
+ const ARMBaseInstrInfo &TII;
+ const ARMSubtarget &STI;
+
+ /// FramePtr - ARM physical register used as the frame pointer.
+ unsigned FramePtr;
+
+ // Can only be subclassed.
+ explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
+ const ARMSubtarget &STI);
+
+ // Return the opcode that implements 'Op', or 0 if no such opcode exists.
+ unsigned getOpcode(int Op) const;
+
+public:
+ /// getRegisterNumbering - Given the enum value for some register, e.g.
+ /// ARM::LR, return the number that it corresponds to (e.g. 14). It
+ /// also returns true in isSPVFP if the register is a single precision
+ /// VFP register.
+ static unsigned getRegisterNumbering(unsigned RegEnum, bool *isSPVFP = 0);
+
+ /// Code Generation virtual methods...
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ const TargetRegisterClass* const*
+ getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+
+ std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
+ getAllocationOrder(const TargetRegisterClass *RC,
+ unsigned HintType, unsigned HintReg,
+ const MachineFunction &MF) const;
+
+ unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg,
+ const MachineFunction &MF) const;
+
+ void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+ MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ bool cannotEliminateFrame(const MachineFunction &MF) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+ bool isLowRegister(unsigned Reg) const;
+
+
+ /// emitLoadConstPool - Emits a load from the constant pool to materialize
+ /// the specified immediate.
+ virtual void emitLoadConstPool(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx,
+ int Val,
+ ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0) const;
+
+ /// Code Generation virtual methods...
+ virtual bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+
+ virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+ virtual bool hasReservedCallFrame(MachineFunction &MF) const;
+
+ virtual void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
+
+ virtual void emitPrologue(MachineFunction &MF) const;
+ virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+private:
+ unsigned estimateRSStackSizeLimit(MachineFunction &MF) const;
+
+ unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const;
+
+ unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 8a4c741..7161639 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -111,6 +111,7 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
@@ -122,6 +123,7 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
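The two added lines give 128-bit vectors a register rule of their own. Tracing
one argument through CC_ARM_AAPCS_VFP as now written (an illustrative trace): a
<4 x float> is first bit-converted to v2f64 by the rule above it, then assigned
a NEON quad register; previously v2f64 had no entry here and fell through past
the D- and S-register rules.

    // v4f32 arg --CCBitConvertToType<v2f64>--> v2f64 --new rule--> Q0
    // a second such argument gets Q1, and so on through Q3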
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index f295761..6f1c624 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -26,14 +26,18 @@
#include "llvm/PassManager.h"
#include "llvm/CodeGen/MachineCodeEmitter.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/ObjectCodeEmitter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#ifndef NDEBUG
#include <iomanip>
#endif
@@ -57,12 +61,18 @@ namespace {
ARMJITInfo *JTI;
const ARMInstrInfo *II;
const TargetData *TD;
+ const ARMSubtarget *Subtarget;
TargetMachine &TM;
CodeEmitter &MCE;
const std::vector<MachineConstantPoolEntry> *MCPEs;
const std::vector<MachineJumpTableEntry> *MJTEs;
bool IsPIC;
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineModuleInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
public:
static char ID;
explicit Emitter(TargetMachine &tm, CodeEmitter &mce)
@@ -160,7 +170,7 @@ namespace {
/// Routines that handle operands which add machine relocations which are
/// fixed up by the relocation stage.
void emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
- bool NeedStub, intptr_t ACPV = 0);
+ bool NeedStub, bool Indirect, intptr_t ACPV = 0);
void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
void emitConstPoolAddress(unsigned CPI, unsigned Reloc);
void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc);
@@ -174,36 +184,39 @@ namespace {
/// createARMCodeEmitterPass - Return a pass that emits the collected ARM code
/// to the specified MCE object.
-namespace llvm {
-
-FunctionPass *createARMCodeEmitterPass(ARMBaseTargetMachine &TM,
- MachineCodeEmitter &MCE) {
+FunctionPass *llvm::createARMCodeEmitterPass(ARMBaseTargetMachine &TM,
+ MachineCodeEmitter &MCE) {
return new Emitter<MachineCodeEmitter>(TM, MCE);
}
-FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
- JITCodeEmitter &JCE) {
+FunctionPass *llvm::createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
+ JITCodeEmitter &JCE) {
return new Emitter<JITCodeEmitter>(TM, JCE);
}
-
-} // end namespace llvm
+FunctionPass *llvm::createARMObjectCodeEmitterPass(ARMBaseTargetMachine &TM,
+ ObjectCodeEmitter &OCE) {
+ return new Emitter<ObjectCodeEmitter>(TM, OCE);
+}
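Note: the three factories above differ only in the CodeEmitter template
argument. A sketch of typical use (a hypothetical hook body, assuming the
addCodeEmitter plumbing of this LLVM vintage):

    // In ARMTargetMachine: select the emitter flavor at pass-setup time.
    bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
                                              CodeGenOpt::Level OptLevel,
                                              MachineCodeEmitter &MCE) {
      PM.add(createARMCodeEmitterPass(*this, MCE));
      return false;  // conventionally false on success (an assumption)
    }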
template<class CodeEmitter>
bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
assert((MF.getTarget().getRelocationModel() != Reloc::Default ||
MF.getTarget().getRelocationModel() != Reloc::Static) &&
"JIT relocation model must be set to static or default!");
+ JTI = ((ARMTargetMachine&)MF.getTarget()).getJITInfo();
II = ((ARMTargetMachine&)MF.getTarget()).getInstrInfo();
TD = ((ARMTargetMachine&)MF.getTarget()).getTargetData();
- JTI = ((ARMTargetMachine&)MF.getTarget()).getJITInfo();
+ Subtarget = &TM.getSubtarget<ARMSubtarget>();
MCPEs = &MF.getConstantPool()->getConstants();
MJTEs = &MF.getJumpTableInfo()->getJumpTables();
IsPIC = TM.getRelocationModel() == Reloc::PIC_;
JTI->Initialize(MF, IsPIC);
+ MCE.setModuleInfo(&getAnalysis<MachineModuleInfo>());
do {
- DOUT << "JITTing function '" << MF.getFunction()->getName() << "'\n";
+ DEBUG(errs() << "JITTing function '"
+ << MF.getFunction()->getName() << "'\n");
MCE.startFunction(MF);
- for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
MCE.StartMachineBasicBlock(MBB);
for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
@@ -220,7 +233,7 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
template<class CodeEmitter>
unsigned Emitter<CodeEmitter>::getShiftOp(unsigned Imm) const {
switch (ARM_AM::getAM2ShiftOpc(Imm)) {
- default: assert(0 && "Unknown shift opc!");
+ default: llvm_unreachable("Unknown shift opc!");
case ARM_AM::asr: return 2;
case ARM_AM::lsl: return 0;
case ARM_AM::lsr: return 1;
@@ -240,7 +253,7 @@ unsigned Emitter<CodeEmitter>::getMachineOpValue(const MachineInstr &MI,
else if (MO.isImm())
return static_cast<unsigned>(MO.getImm());
else if (MO.isGlobal())
- emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true);
+ emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true, false);
else if (MO.isSymbol())
emitExternalSymbolAddress(MO.getSymbolName(), ARM::reloc_arm_branch);
else if (MO.isCPI()) {
@@ -254,8 +267,10 @@ unsigned Emitter<CodeEmitter>::getMachineOpValue(const MachineInstr &MI,
else if (MO.isMBB())
emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch);
else {
- cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
- abort();
+#ifndef NDEBUG
+ errs() << MO;
+#endif
+ llvm_unreachable(0);
}
return 0;
}
@@ -264,9 +279,14 @@ unsigned Emitter<CodeEmitter>::getMachineOpValue(const MachineInstr &MI,
///
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
- bool NeedStub, intptr_t ACPV) {
- MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
- GV, ACPV, NeedStub));
+ bool NeedStub, bool Indirect,
+ intptr_t ACPV) {
+ MachineRelocation MR = Indirect
+ ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+ GV, ACPV, NeedStub)
+ : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ GV, ACPV, NeedStub);
+ MCE.addRelocation(MR);
}
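Note: the new Indirect flag pushes the direct-versus-indirect decision out to
callers. A sketch of the intended call pattern (mirroring the constant-pool
hunk further down):

    // Choose the relocation flavor from subtarget policy, then emit.
    Reloc::Model RelocM = TM.getRelocationModel();
    bool Indirect = Subtarget->GVIsIndirectSymbol(GV, RelocM);
    emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
                      isa<Function>(GV), Indirect, (intptr_t)ACPV);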
/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
@@ -294,7 +314,7 @@ void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI,
/// be emitted to the current location in the function, and allow it to be PC
/// relative.
template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTIndex,
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTIndex,
unsigned Reloc) {
MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
Reloc, JTIndex, 0, true));
@@ -310,32 +330,28 @@ void Emitter<CodeEmitter>::emitMachineBasicBlock(MachineBasicBlock *BB,
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitWordLE(unsigned Binary) {
-#ifndef NDEBUG
- DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0')
- << Binary << std::dec << "\n";
-#endif
+ DEBUG(errs() << " 0x";
+ errs().write_hex(Binary) << "\n");
MCE.emitWordLE(Binary);
}
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitDWordLE(uint64_t Binary) {
-#ifndef NDEBUG
- DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0')
- << (unsigned)Binary << std::dec << "\n";
- DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0')
- << (unsigned)(Binary >> 32) << std::dec << "\n";
-#endif
+ DEBUG(errs() << " 0x";
+ errs().write_hex(Binary) << "\n");
MCE.emitDWordLE(Binary);
}
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI) {
- DOUT << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI;
+ DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI);
+
+ MCE.processDebugLoc(MI.getDebugLoc(), true);
NumEmitted++; // Keep track of the # of mi's emitted
switch (MI.getDesc().TSFlags & ARMII::FormMask) {
default: {
- assert(0 && "Unhandled instruction encoding format!");
+ llvm_unreachable("Unhandled instruction encoding format!");
break;
}
case ARMII::Pseudo:
@@ -393,6 +409,7 @@ void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI) {
emitMiscInstruction(MI);
break;
}
+ MCE.processDebugLoc(MI.getDebugLoc(), false);
}
template<class CodeEmitter>
@@ -400,7 +417,7 @@ void Emitter<CodeEmitter>::emitConstPoolInstruction(const MachineInstr &MI) {
unsigned CPI = MI.getOperand(0).getImm(); // CP instruction index.
unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index.
const MachineConstantPoolEntry &MCPE = (*MCPEs)[CPIndex];
-
+
// Remember the CONSTPOOL_ENTRY address for later relocation.
JTI->addConstantPoolEntryAddr(CPI, MCE.getCurrentPCValue());
@@ -410,55 +427,49 @@ void Emitter<CodeEmitter>::emitConstPoolInstruction(const MachineInstr &MI) {
ARMConstantPoolValue *ACPV =
static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
- DOUT << " ** ARM constant pool #" << CPI << " @ "
- << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n';
+ DEBUG(errs() << " ** ARM constant pool #" << CPI << " @ "
+ << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n');
GlobalValue *GV = ACPV->getGV();
if (GV) {
- assert(!ACPV->isStub() && "Don't know how to deal this yet!");
- if (ACPV->isNonLazyPointer())
- MCE.addRelocation(MachineRelocation::getIndirectSymbol(
- MCE.getCurrentPCOffset(), ARM::reloc_arm_machine_cp_entry, GV,
- (intptr_t)ACPV, false));
- else
- emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
- ACPV->isStub() || isa<Function>(GV), (intptr_t)ACPV);
+ Reloc::Model RelocM = TM.getRelocationModel();
+ emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
+ isa<Function>(GV),
+ Subtarget->GVIsIndirectSymbol(GV, RelocM),
+ (intptr_t)ACPV);
} else {
- assert(!ACPV->isNonLazyPointer() && "Don't know how to deal this yet!");
emitExternalSymbolAddress(ACPV->getSymbol(), ARM::reloc_arm_absolute);
}
emitWordLE(0);
} else {
Constant *CV = MCPE.Val.ConstVal;
-#ifndef NDEBUG
- DOUT << " ** Constant pool #" << CPI << " @ "
- << (void*)MCE.getCurrentPCValue() << " ";
- if (const Function *F = dyn_cast<Function>(CV))
- DOUT << F->getName();
- else
- DOUT << *CV;
- DOUT << '\n';
-#endif
+ DEBUG({
+ errs() << " ** Constant pool #" << CPI << " @ "
+ << (void*)MCE.getCurrentPCValue() << " ";
+ if (const Function *F = dyn_cast<Function>(CV))
+ errs() << F->getName();
+ else
+ errs() << *CV;
+ errs() << '\n';
+ });
if (GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
- emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV));
+ emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false);
emitWordLE(0);
} else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
uint32_t Val = *(uint32_t*)CI->getValue().getRawData();
emitWordLE(Val);
} else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
- if (CFP->getType() == Type::FloatTy)
+ if (CFP->getType()->isFloatTy())
emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
- else if (CFP->getType() == Type::DoubleTy)
+ else if (CFP->getType()->isDoubleTy())
emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
else {
- assert(0 && "Unable to handle this constantpool entry!");
- abort();
+ llvm_unreachable("Unable to handle this constantpool entry!");
}
} else {
- assert(0 && "Unable to handle this constantpool entry!");
- abort();
+ llvm_unreachable("Unable to handle this constantpool entry!");
}
}
}
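Note: the isFloatTy()/isDoubleTy() arms emit the raw IEEE-754 bit pattern of
the constant. Two worked values:

    // float 1.0f  -> emitWordLE(0x3F800000)
    // double 1.0  -> emitDWordLE(0x3FF0000000000000ULL)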
@@ -467,7 +478,8 @@ template<class CodeEmitter>
void Emitter<CodeEmitter>::emitMOVi2piecesInstruction(const MachineInstr &MI) {
const MachineOperand &MO0 = MI.getOperand(0);
const MachineOperand &MO1 = MI.getOperand(1);
- assert(MO1.isImm() && "Not a valid so_imm value!");
+  assert(MO1.isImm() && ARM_AM::getSOImmVal(MO1.getImm()) != -1 &&
+         "Not a valid so_imm value!");
unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO1.getImm());
unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO1.getImm());
@@ -483,7 +495,7 @@ void Emitter<CodeEmitter>::emitMOVi2piecesInstruction(const MachineInstr &MI) {
// Encode so_imm.
// Set bit I(25) to identify this is the immediate form of <shifter_op>
Binary |= 1 << ARMII::I_BitShift;
- Binary |= getMachineSoImmOpValue(ARM_AM::getSOImmVal(V1));
+ Binary |= getMachineSoImmOpValue(V1);
emitWordLE(Binary);
// Now the 'orr' instruction.
@@ -501,14 +513,14 @@ void Emitter<CodeEmitter>::emitMOVi2piecesInstruction(const MachineInstr &MI) {
// Encode so_imm.
// Set bit I(25) to identify this is the immediate form of <shifter_op>
Binary |= 1 << ARMII::I_BitShift;
- Binary |= getMachineSoImmOpValue(ARM_AM::getSOImmVal(V2));
+ Binary |= getMachineSoImmOpValue(V2);
emitWordLE(Binary);
}
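Note: the tightened assert plus the direct getMachineSoImmOpValue(V1/V2) calls
lean on the two-part so_imm split. A sketch (assuming the ARM_AM helpers
behave as their names suggest):

    // 0x00FF00FF is not a single so_imm, but both halves are:
    unsigned V1 = ARM_AM::getSOImmTwoPartFirst(0x00FF00FF);
    unsigned V2 = ARM_AM::getSOImmTwoPartSecond(0x00FF00FF);
    // V1 and V2 are 0x00FF0000 and 0x000000FF (in whichever order the
    // helpers define); the pair is emitted as:
    //   mov rd, #V1  followed by  orr rd, rd, #V2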
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitLEApcrelJTInstruction(const MachineInstr &MI) {
// It's basically add r, pc, (LJTI - $+8)
-
+
const TargetInstrDesc &TID = MI.getDesc();
// Emit the 'add' instruction.
@@ -527,7 +539,6 @@ void Emitter<CodeEmitter>::emitLEApcrelJTInstruction(const MachineInstr &MI) {
Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
// Encode the displacement.
- // Set bit I(25) to identify this is the immediate form of <shifter_op>.
Binary |= 1 << ARMII::I_BitShift;
emitJumpTableAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_jt_base);
@@ -576,8 +587,8 @@ void Emitter<CodeEmitter>::emitPseudoMoveInstruction(const MachineInstr &MI) {
template<class CodeEmitter>
void Emitter<CodeEmitter>::addPCLabel(unsigned LabelID) {
- DOUT << " ** LPC" << LabelID << " @ "
- << (void*)MCE.getCurrentPCValue() << '\n';
+ DEBUG(errs() << " ** LPC" << LabelID << " @ "
+ << (void*)MCE.getCurrentPCValue() << '\n');
JTI->addPCLabelAddr(LabelID, MCE.getCurrentPCValue());
}
@@ -586,13 +597,13 @@ void Emitter<CodeEmitter>::emitPseudoInstruction(const MachineInstr &MI) {
unsigned Opcode = MI.getDesc().Opcode;
switch (Opcode) {
default:
- abort(); // FIXME:
+ llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction");
+ // FIXME: Add support for MOVimm32.
case TargetInstrInfo::INLINEASM: {
// We allow inline assembler nodes with empty bodies - they can
// implicitly define registers, which is ok for JIT.
if (MI.getOperand(0).getSymbolName()[0]) {
- assert(0 && "JIT does not support inline asm!\n");
- abort();
+ llvm_report_error("JIT does not support inline asm!");
}
break;
}
@@ -601,7 +612,7 @@ void Emitter<CodeEmitter>::emitPseudoInstruction(const MachineInstr &MI) {
MCE.emitLabel(MI.getOperand(0).getImm());
break;
case TargetInstrInfo::IMPLICIT_DEF:
- case TargetInstrInfo::DECLARE:
+ case TargetInstrInfo::KILL:
case ARM::DWARF_LOC:
// Do nothing.
break;
@@ -674,7 +685,7 @@ unsigned Emitter<CodeEmitter>::getMachineSoRegOpValue(
// ROR - 0111
// RRX - 0110 and bit[11:8] clear.
switch (SOpc) {
- default: assert(0 && "Unknown shift opc!");
+ default: llvm_unreachable("Unknown shift opc!");
case ARM_AM::lsl: SBits = 0x1; break;
case ARM_AM::lsr: SBits = 0x3; break;
case ARM_AM::asr: SBits = 0x5; break;
@@ -688,7 +699,7 @@ unsigned Emitter<CodeEmitter>::getMachineSoRegOpValue(
// ASR - 100
// ROR - 110
switch (SOpc) {
- default: assert(0 && "Unknown shift opc!");
+ default: llvm_unreachable("Unknown shift opc!");
case ARM_AM::lsl: SBits = 0x0; break;
case ARM_AM::lsr: SBits = 0x2; break;
case ARM_AM::asr: SBits = 0x4; break;
@@ -713,12 +724,15 @@ unsigned Emitter<CodeEmitter>::getMachineSoRegOpValue(
template<class CodeEmitter>
unsigned Emitter<CodeEmitter>::getMachineSoImmOpValue(unsigned SoImm) {
+ int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+ assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
// Encode rotate_imm.
- unsigned Binary = (ARM_AM::getSOImmValRot(SoImm) >> 1)
+ unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
<< ARMII::SoRotImmShift;
// Encode immed_8.
- Binary |= ARM_AM::getSOImmValImm(SoImm);
+ Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
return Binary;
}
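Note: a so_imm is an 8-bit value rotated right by an even amount; the encoding
packs rotate_imm (the rotation halved) next to immed_8. A worked example of
what this routine now computes (assuming the helpers return the rotation and
byte as named):

    // 0x3FC == 0xFF rotated right by 30:
    //   ARM_AM::getSOImmVal(0x3FC) != -1          (encodable)
    //   getSOImmValRot(...) == 30  -> rotate_imm = 30 >> 1 = 15
    //   getSOImmValImm(...) == 0xFF
    //   Binary = (15 << ARMII::SoRotImmShift) | 0xFF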
@@ -740,6 +754,10 @@ void Emitter<CodeEmitter>::emitDataProcessingInstruction(
unsigned ImplicitRn) {
const TargetInstrDesc &TID = MI.getDesc();
+ if (TID.Opcode == ARM::BFC) {
+ llvm_report_error("ARMv6t2 JIT is not yet supported.");
+ }
+
// Part of binary is determined by TableGn.
unsigned Binary = getBinaryCodeForInstr(MI);
@@ -791,9 +809,7 @@ void Emitter<CodeEmitter>::emitDataProcessingInstruction(
}
// Encode so_imm.
- // Set bit I(25) to identify this is the immediate form of <shifter_op>.
- Binary |= 1 << ARMII::I_BitShift;
- Binary |= getMachineSoImmOpValue(MO.getImm());
+ Binary |= getMachineSoImmOpValue((unsigned)MO.getImm());
emitWordLE(Binary);
}
@@ -952,8 +968,8 @@ static unsigned getAddrModeUPBits(unsigned Mode) {
// DA - Decrement after - bit U = 0 and bit P = 0
// DB - Decrement before - bit U = 0 and bit P = 1
switch (Mode) {
- default: assert(0 && "Unknown addressing sub-mode!");
- case ARM_AM::da: break;
+ default: llvm_unreachable("Unknown addressing sub-mode!");
+ case ARM_AM::da: break;
case ARM_AM::db: Binary |= 0x1 << ARMII::P_BitShift; break;
case ARM_AM::ia: Binary |= 0x1 << ARMII::U_BitShift; break;
case ARM_AM::ib: Binary |= 0x3 << ARMII::U_BitShift; break;
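Note: assuming U_BitShift names bit 23 with P immediately above it (as in the
ARM load/store-multiple encoding), the four sub-modes map to:

    // DA: P=0,U=0   DB: P=1,U=0   IA: P=0,U=1   IB: P=1,U=1
    // (the ib case sets 0x3 << U_BitShift, i.e. both U and the adjacent P)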
@@ -983,7 +999,7 @@ void Emitter<CodeEmitter>::emitLoadStoreMultipleInstruction(
Binary |= 0x1 << ARMII::W_BitShift;
// Set registers
- for (unsigned i = 4, e = MI.getNumOperands(); i != e; ++i) {
+ for (unsigned i = 5, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || MO.isImplicit())
break;
@@ -1107,7 +1123,7 @@ void Emitter<CodeEmitter>::emitMiscArithInstruction(const MachineInstr &MI) {
unsigned ShiftAmt = MI.getOperand(OpIdx).getImm();
assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!");
Binary |= ShiftAmt << ARMII::ShiftShift;
-
+
emitWordLE(Binary);
}
@@ -1115,8 +1131,9 @@ template<class CodeEmitter>
void Emitter<CodeEmitter>::emitBranchInstruction(const MachineInstr &MI) {
const TargetInstrDesc &TID = MI.getDesc();
- if (TID.Opcode == ARM::TPsoft)
- abort(); // FIXME
+ if (TID.Opcode == ARM::TPsoft) {
+ llvm_unreachable("ARM::TPsoft FIXME"); // FIXME
+ }
// Part of binary is determined by TableGn.
unsigned Binary = getBinaryCodeForInstr(MI);
@@ -1135,7 +1152,8 @@ void Emitter<CodeEmitter>::emitInlineJumpTable(unsigned JTIndex) {
// Remember the base address of the inline jump table.
uintptr_t JTBase = MCE.getCurrentPCValue();
JTI->addJumpTableBaseAddr(JTIndex, JTBase);
- DOUT << " ** Jump Table #" << JTIndex << " @ " << (void*)JTBase << '\n';
+ DEBUG(errs() << " ** Jump Table #" << JTIndex << " @ " << (void*)JTBase
+ << '\n');
// Now emit the jump table entries.
const std::vector<MachineBasicBlock*> &MBBs = (*MJTEs)[JTIndex].MBBs;
@@ -1155,17 +1173,17 @@ void Emitter<CodeEmitter>::emitMiscBranchInstruction(const MachineInstr &MI) {
const TargetInstrDesc &TID = MI.getDesc();
// Handle jump tables.
- if (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::BR_JTadd ||
- TID.Opcode == ARM::t2BR_JTr || TID.Opcode == ARM::t2BR_JTadd) {
+ if (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::BR_JTadd) {
// First emit a ldr pc, [] instruction.
emitDataProcessingInstruction(MI, ARM::PC);
// Then emit the inline jump table.
- unsigned JTIndex = (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::t2BR_JTr)
+ unsigned JTIndex =
+ (TID.Opcode == ARM::BR_JTr)
? MI.getOperand(1).getIndex() : MI.getOperand(2).getIndex();
emitInlineJumpTable(JTIndex);
return;
- } else if (TID.Opcode == ARM::BR_JTm || TID.Opcode == ARM::t2BR_JTm) {
+ } else if (TID.Opcode == ARM::BR_JTm) {
// First emit a ldr pc, [] instruction.
emitLoadStoreInstruction(MI, ARM::PC);
@@ -1183,7 +1201,7 @@ void Emitter<CodeEmitter>::emitMiscBranchInstruction(const MachineInstr &MI) {
if (TID.Opcode == ARM::BX_RET)
// The return register is LR.
Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::LR);
- else
+ else
// otherwise, set the return register
Binary |= getMachineOpValue(MI, 0);
@@ -1194,7 +1212,7 @@ static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) {
unsigned RegD = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
bool isSPVFP = false;
- RegD = ARMRegisterInfo::getRegisterNumbering(RegD, isSPVFP);
+ RegD = ARMRegisterInfo::getRegisterNumbering(RegD, &isSPVFP);
if (!isSPVFP)
Binary |= RegD << ARMII::RegRdShift;
else {
@@ -1208,7 +1226,7 @@ static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) {
unsigned RegN = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
bool isSPVFP = false;
- RegN = ARMRegisterInfo::getRegisterNumbering(RegN, isSPVFP);
+ RegN = ARMRegisterInfo::getRegisterNumbering(RegN, &isSPVFP);
if (!isSPVFP)
Binary |= RegN << ARMII::RegRnShift;
else {
@@ -1222,7 +1240,7 @@ static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) {
unsigned RegM = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
bool isSPVFP = false;
- RegM = ARMRegisterInfo::getRegisterNumbering(RegM, isSPVFP);
+ RegM = ARMRegisterInfo::getRegisterNumbering(RegM, &isSPVFP);
if (!isSPVFP)
Binary |= RegM;
else {
@@ -1268,7 +1286,7 @@ void Emitter<CodeEmitter>::emitVFPArithInstruction(const MachineInstr &MI) {
// Encode Dm / Sm.
Binary |= encodeVFPRm(MI, OpIdx);
-
+
emitWordLE(Binary);
}
@@ -1386,11 +1404,11 @@ void Emitter<CodeEmitter>::emitVFPLoadStoreMultipleInstruction(
Binary |= 0x1 << ARMII::W_BitShift;
// First register is encoded in Dd.
- Binary |= encodeVFPRd(MI, 4);
+ Binary |= encodeVFPRd(MI, 5);
// Number of registers are encoded in offset field.
unsigned NumRegs = 1;
- for (unsigned i = 5, e = MI.getNumOperands(); i != e; ++i) {
+ for (unsigned i = 6, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || MO.isImplicit())
break;
@@ -1413,4 +1431,3 @@ void Emitter<CodeEmitter>::emitMiscInstruction(const MachineInstr &MI) {
}
#include "ARMGenCodeEmitter.inc"
-
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 9fedaa4..309e3ba 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -15,24 +15,31 @@
#define DEBUG_TYPE "arm-cp-islands"
#include "ARM.h"
+#include "ARMAddressingModes.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMInstrInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
-STATISTIC(NumCPEs, "Number of constpool entries");
-STATISTIC(NumSplit, "Number of uncond branches inserted");
-STATISTIC(NumCBrFixed, "Number of cond branches fixed");
-STATISTIC(NumUBrFixed, "Number of uncond branches fixed");
+STATISTIC(NumCPEs, "Number of constpool entries");
+STATISTIC(NumSplit, "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed, "Number of cond branches fixed");
+STATISTIC(NumUBrFixed, "Number of uncond branches fixed");
+STATISTIC(NumTBs, "Number of table branches generated");
+STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk");
+STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk");
namespace {
/// ARMConstantIslands - Due to limited PC-relative displacements, ARM
@@ -63,6 +70,8 @@ namespace {
/// to a return, unreachable, or unconditional branch).
std::vector<MachineBasicBlock*> WaterList;
+ typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
/// CPUser - One user of a constant pool, keeping the machine instruction
/// pointer, the constant pool being referenced, and the max displacement
/// allowed from the instruction to the CP.
@@ -70,8 +79,11 @@ namespace {
MachineInstr *MI;
MachineInstr *CPEMI;
unsigned MaxDisp;
- CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp)
- : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp) {}
+ bool NegOk;
+ bool IsSoImm;
+ CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+ bool neg, bool soimm)
+ : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm) {}
};
/// CPUsers - Keep track of all of the machine instructions that use various
@@ -117,29 +129,34 @@ namespace {
///
SmallVector<MachineInstr*, 4> PushPopMIs;
+ /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions.
+ SmallVector<MachineInstr*, 4> T2JumpTables;
+
/// HasFarJump - True if any far jump instruction has been emitted during
/// the branch fix up pass.
bool HasFarJump;
const TargetInstrInfo *TII;
+ const ARMSubtarget *STI;
ARMFunctionInfo *AFI;
bool isThumb;
+ bool isThumb1;
bool isThumb2;
public:
static char ID;
ARMConstantIslands() : MachineFunctionPass(&ID) {}
- virtual bool runOnMachineFunction(MachineFunction &Fn);
+ virtual bool runOnMachineFunction(MachineFunction &MF);
virtual const char *getPassName() const {
return "ARM constant island placement and branch shortening pass";
}
private:
- void DoInitialPlacement(MachineFunction &Fn,
+ void DoInitialPlacement(MachineFunction &MF,
std::vector<MachineInstr*> &CPEMIs);
CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
- void InitialFunctionScan(MachineFunction &Fn,
+ void InitialFunctionScan(MachineFunction &MF,
const std::vector<MachineInstr*> &CPEMIs);
MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI);
void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB);
@@ -147,58 +164,62 @@ namespace {
bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI);
int LookForExistingCPEntry(CPUser& U, unsigned UserOffset);
bool LookForWater(CPUser&U, unsigned UserOffset,
- MachineBasicBlock** NewMBB);
- MachineBasicBlock* AcceptWater(MachineBasicBlock *WaterBB,
- std::vector<MachineBasicBlock*>::iterator IP);
+ MachineBasicBlock *&NewMBB);
+ MachineBasicBlock *AcceptWater(water_iterator IP);
void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset,
- MachineBasicBlock** NewMBB);
- bool HandleConstantPoolUser(MachineFunction &Fn, unsigned CPUserIndex);
+ MachineBasicBlock *&NewMBB);
+ bool HandleConstantPoolUser(MachineFunction &MF, unsigned CPUserIndex);
void RemoveDeadCPEMI(MachineInstr *CPEMI);
bool RemoveUnusedCPEntries();
bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset,
- MachineInstr *CPEMI, unsigned Disp,
- bool DoDump);
+ MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+ bool DoDump = false);
bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water,
CPUser &U);
bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset,
- unsigned Disp, bool NegativeOK);
+ unsigned Disp, bool NegativeOK, bool IsSoImm = false);
bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
- bool FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br);
- bool FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br);
- bool FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br);
+ bool FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br);
+ bool FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br);
+ bool FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br);
bool UndoLRSpillRestore();
+ bool OptimizeThumb2Instructions(MachineFunction &MF);
+ bool OptimizeThumb2Branches(MachineFunction &MF);
+ bool OptimizeThumb2JumpTables(MachineFunction &MF);
unsigned GetOffsetOf(MachineInstr *MI) const;
void dumpBBs();
- void verify(MachineFunction &Fn);
+ void verify(MachineFunction &MF);
};
char ARMConstantIslands::ID = 0;
}
/// verify - check BBOffsets, BBSizes, alignment of islands
-void ARMConstantIslands::verify(MachineFunction &Fn) {
+void ARMConstantIslands::verify(MachineFunction &MF) {
assert(BBOffsets.size() == BBSizes.size());
for (unsigned i = 1, e = BBOffsets.size(); i != e; ++i)
assert(BBOffsets[i-1]+BBSizes[i-1] == BBOffsets[i]);
- if (isThumb) {
- for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock *MBB = MBBI;
- if (!MBB->empty() &&
- MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY)
- assert((BBOffsets[MBB->getNumber()]%4 == 0 &&
- BBSizes[MBB->getNumber()]%4 == 0) ||
- (BBOffsets[MBB->getNumber()]%4 != 0 &&
- BBSizes[MBB->getNumber()]%4 != 0));
+ if (!isThumb)
+ return;
+#ifndef NDEBUG
+ for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+ MBBI != E; ++MBBI) {
+ MachineBasicBlock *MBB = MBBI;
+ if (!MBB->empty() &&
+ MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
+ unsigned MBBId = MBB->getNumber();
+ assert((BBOffsets[MBBId]%4 == 0 && BBSizes[MBBId]%4 == 0) ||
+ (BBOffsets[MBBId]%4 != 0 && BBSizes[MBBId]%4 != 0));
}
}
+#endif
}
/// print block size and offset information - debugging
void ARMConstantIslands::dumpBBs() {
for (unsigned J = 0, E = BBOffsets.size(); J !=E; ++J) {
- DOUT << "block " << J << " offset " << BBOffsets[J] <<
- " size " << BBSizes[J] << "\n";
+ DEBUG(errs() << "block " << J << " offset " << BBOffsets[J]
+ << " size " << BBSizes[J] << "\n");
}
}
@@ -208,31 +229,36 @@ FunctionPass *llvm::createARMConstantIslandPass() {
return new ARMConstantIslands();
}
-bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
- MachineConstantPool &MCP = *Fn.getConstantPool();
+bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) {
+ MachineConstantPool &MCP = *MF.getConstantPool();
+
+ TII = MF.getTarget().getInstrInfo();
+ AFI = MF.getInfo<ARMFunctionInfo>();
+ STI = &MF.getTarget().getSubtarget<ARMSubtarget>();
- TII = Fn.getTarget().getInstrInfo();
- AFI = Fn.getInfo<ARMFunctionInfo>();
isThumb = AFI->isThumbFunction();
+ isThumb1 = AFI->isThumb1OnlyFunction();
isThumb2 = AFI->isThumb2Function();
HasFarJump = false;
// Renumber all of the machine basic blocks in the function, guaranteeing that
// the numbers agree with the position of the block in the function.
- Fn.RenumberBlocks();
+ MF.RenumberBlocks();
+
+ // Thumb1 functions containing constant pools get 4-byte alignment.
+ // This is so we can keep exact track of where the alignment padding goes.
- /// Thumb functions containing constant pools get 2-byte alignment.
- /// This is so we can keep exact track of where the alignment padding goes.
- /// Set default.
- AFI->setAlign(isThumb ? 1U : 2U);
+  // Set default. Thumb1 functions are 2-byte aligned, ARM and Thumb2 are
+  // 4-byte aligned.
+ AFI->setAlign(isThumb1 ? 1U : 2U);
// Perform the initial placement of the constant pool entries. To start with,
// we put them all at the end of the function.
std::vector<MachineInstr*> CPEMIs;
if (!MCP.isEmpty()) {
- DoInitialPlacement(Fn, CPEMIs);
- if (isThumb)
+ DoInitialPlacement(MF, CPEMIs);
+ if (isThumb1)
AFI->setAlign(2U);
}
@@ -242,7 +268,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
// Do the initial scan of the function, building up information about the
// sizes of each block, the location of all the water, and finding all of the
// constant pool users.
- InitialFunctionScan(Fn, CPEMIs);
+ InitialFunctionScan(MF, CPEMIs);
CPEMIs.clear();
/// Remove dead constant pool entries.
@@ -251,25 +277,37 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
// Iteratively place constant pool entries and fix up branches until there
// is no change.
bool MadeChange = false;
+ unsigned NoCPIters = 0, NoBRIters = 0;
while (true) {
- bool Change = false;
+ bool CPChange = false;
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
- Change |= HandleConstantPoolUser(Fn, i);
+ CPChange |= HandleConstantPoolUser(MF, i);
+ if (CPChange && ++NoCPIters > 30)
+ llvm_unreachable("Constant Island pass failed to converge!");
DEBUG(dumpBBs());
+
+ bool BRChange = false;
for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
- Change |= FixUpImmediateBr(Fn, ImmBranches[i]);
+ BRChange |= FixUpImmediateBr(MF, ImmBranches[i]);
+ if (BRChange && ++NoBRIters > 30)
+ llvm_unreachable("Branch Fix Up pass failed to converge!");
DEBUG(dumpBBs());
- if (!Change)
+
+ if (!CPChange && !BRChange)
break;
MadeChange = true;
}
+ // Shrink 32-bit Thumb2 branch, load, and store instructions.
+ if (isThumb2)
+ MadeChange |= OptimizeThumb2Instructions(MF);
+
// After a while, this might be made debug-only, but it is not expensive.
- verify(Fn);
+ verify(MF);
// If LR has been forced spilled and no far jumps (i.e. BL) has been issued.
// Undo the spill / restore of LR if possible.
- if (!HasFarJump && AFI->isLRSpilledForFarJump() && isThumb)
+ if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump())
MadeChange |= UndoLRSpillRestore();
BBSizes.clear();
@@ -279,24 +317,25 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
CPEntries.clear();
ImmBranches.clear();
PushPopMIs.clear();
+ T2JumpTables.clear();
return MadeChange;
}
/// DoInitialPlacement - Perform the initial placement of the constant pool
/// entries. To start with, we put them all at the end of the function.
-void ARMConstantIslands::DoInitialPlacement(MachineFunction &Fn,
+void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF,
std::vector<MachineInstr*> &CPEMIs) {
// Create the basic block to hold the CPE's.
- MachineBasicBlock *BB = Fn.CreateMachineBasicBlock();
- Fn.push_back(BB);
+ MachineBasicBlock *BB = MF.CreateMachineBasicBlock();
+ MF.push_back(BB);
// Add all of the constants from the constant pool to the end block, use an
// identity mapping of CPI's to CPE's.
const std::vector<MachineConstantPoolEntry> &CPs =
- Fn.getConstantPool()->getConstants();
+ MF.getConstantPool()->getConstants();
- const TargetData &TD = *Fn.getTarget().getTargetData();
+ const TargetData &TD = *MF.getTarget().getTargetData();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
// Verify that all constant pool entries are a multiple of 4 bytes. If not,
@@ -313,7 +352,8 @@ void ARMConstantIslands::DoInitialPlacement(MachineFunction &Fn,
CPEs.push_back(CPEntry(CPEMI, i));
CPEntries.push_back(CPEs);
NumCPEs++;
- DOUT << "Moved CPI#" << i << " to end of function as #" << i << "\n";
+ DEBUG(errs() << "Moved CPI#" << i << " to end of function as #" << i
+ << "\n");
}
}
@@ -352,10 +392,10 @@ ARMConstantIslands::CPEntry
/// InitialFunctionScan - Do the initial scan of the function, building up
/// information about the sizes of each block, the location of all the water,
/// and finding all of the constant pool users.
-void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
+void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF,
const std::vector<MachineInstr*> &CPEMIs) {
unsigned Offset = 0;
- for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
+ for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
MBBI != E; ++MBBI) {
MachineBasicBlock &MBB = *MBBI;
@@ -377,18 +417,19 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
unsigned Scale = 1;
int UOpc = Opc;
switch (Opc) {
+ default:
+ continue; // Ignore other JT branches
case ARM::tBR_JTr:
- case ARM::t2BR_JTr:
- case ARM::t2BR_JTm:
- case ARM::t2BR_JTadd:
- // A Thumb table jump may involve padding; for the offsets to
+ // A Thumb1 table jump may involve padding; for the offsets to
// be right, functions containing these must be 4-byte aligned.
AFI->setAlign(2U);
if ((Offset+MBBSize)%4 != 0)
+ // FIXME: Add a pseudo ALIGN instruction instead.
MBBSize += 2; // padding
continue; // Does not get an entry in ImmBranches
- default:
- continue; // Ignore other JT branches
+ case ARM::t2BR_JT:
+ T2JumpTables.push_back(I);
+ continue; // Does not get an entry in ImmBranches
case ARM::Bcc:
isCond = true;
UOpc = ARM::B;
@@ -427,6 +468,9 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
PushPopMIs.push_back(I);
+ if (Opc == ARM::CONSTPOOL_ENTRY)
+ continue;
+
// Scan the instructions for constant pool operands.
for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
if (I->getOperand(op).isCPI()) {
@@ -436,50 +480,52 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
// Basic size info comes from the TSFlags field.
unsigned Bits = 0;
unsigned Scale = 1;
- unsigned TSFlags = I->getDesc().TSFlags;
- switch (TSFlags & ARMII::AddrModeMask) {
+ bool NegOk = false;
+ bool IsSoImm = false;
+
+ switch (Opc) {
default:
- // Constant pool entries can reach anything.
- if (I->getOpcode() == ARM::CONSTPOOL_ENTRY)
- continue;
- if (I->getOpcode() == ARM::tLEApcrel) {
- Bits = 8; // Taking the address of a CP entry.
- break;
- }
- assert(0 && "Unknown addressing mode for CP reference!");
- case ARMII::AddrMode1: // AM1: 8 bits << 2
- Bits = 8;
- Scale = 4; // Taking the address of a CP entry.
- break;
- case ARMII::AddrMode2:
- Bits = 12; // +-offset_12
- break;
- case ARMII::AddrMode3:
- Bits = 8; // +-offset_8
+ llvm_unreachable("Unknown addressing mode for CP reference!");
break;
- // addrmode4 has no immediate offset.
- case ARMII::AddrMode5:
+
+ // Taking the address of a CP entry.
+ case ARM::LEApcrel:
+      // This takes a SoImm, which is an 8-bit immediate rotated. We'll
+      // pretend the maximum offset is 255 * 4. Since each instruction
+      // is 4 bytes wide, this is always correct. We'll check for other
+      // displacements that fit in a SoImm as well.
Bits = 8;
- Scale = 4; // +-(offset_8*4)
+ Scale = 4;
+ NegOk = true;
+ IsSoImm = true;
break;
- // addrmode6 has no immediate offset.
- case ARMII::AddrModeT1_1:
- Bits = 5; // +offset_5
+ case ARM::t2LEApcrel:
+ Bits = 12;
+ NegOk = true;
break;
- case ARMII::AddrModeT1_2:
- Bits = 5;
- Scale = 2; // +(offset_5*2)
+ case ARM::tLEApcrel:
+ Bits = 8;
+ Scale = 4;
break;
- case ARMII::AddrModeT1_4:
- Bits = 5;
- Scale = 4; // +(offset_5*4)
+
+ case ARM::LDR:
+ case ARM::LDRcp:
+ case ARM::t2LDRpci:
+ Bits = 12; // +-offset_12
+ NegOk = true;
break;
- case ARMII::AddrModeT1_s:
+
+ case ARM::tLDRpci:
+ case ARM::tLDRcp:
Bits = 8;
Scale = 4; // +(offset_8*4)
break;
- case ARMII::AddrModeT2_pc:
- Bits = 12; // +-offset_12
+
+ case ARM::FLDD:
+ case ARM::FLDS:
+ Bits = 8;
+ Scale = 4; // +-(offset_8*4)
+ NegOk = true;
break;
}
@@ -487,7 +533,7 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
unsigned CPI = I->getOperand(op).getIndex();
MachineInstr *CPEMI = CPEMIs[CPI];
unsigned MaxOffs = ((1 << Bits)-1) * Scale;
- CPUsers.push_back(CPUser(I, CPEMI, MaxOffs));
+ CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm));
// Increment corresponding CPEntry reference count.
CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
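Note: the per-opcode (Bits, Scale, NegOk) triples above become displacement
budgets via MaxOffs = ((1 << Bits) - 1) * Scale:

    // LDR / LDRcp / t2LDRpci: (2^12 - 1) * 1 = 4095 bytes, +/- (NegOk)
    // t2LEApcrel:             (2^12 - 1) * 1 = 4095 bytes, +/- (NegOk)
    // tLDRpci / tLDRcp:       (2^8  - 1) * 4 = 1020 bytes, forward only
    // FLDD / FLDS:            (2^8  - 1) * 4 = 1020 bytes, +/- (NegOk)
    // LEApcrel:               (2^8  - 1) * 4 = 1020 bytes, +/-, IsSoImm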
@@ -563,7 +609,7 @@ void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
// Next, update WaterList. Specifically, we need to add NewMBB as having
// available water after it.
- std::vector<MachineBasicBlock*>::iterator IP =
+ water_iterator IP =
std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
CompareMBBNumbers);
WaterList.insert(IP, NewBB);
@@ -590,8 +636,8 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
// Note the new unconditional branch is not being recorded.
// There doesn't seem to be meaningful DebugInfo available; this doesn't
// correspond to anything in the source.
- BuildMI(OrigBB, DebugLoc::getUnknownLoc(),
- TII->get(isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B)).addMBB(NewBB);
+ unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B;
+ BuildMI(OrigBB, DebugLoc::getUnknownLoc(), TII->get(Opc)).addMBB(NewBB);
NumSplit++;
// Update the CFG. All succs of OrigBB are now succs of NewBB.
@@ -625,7 +671,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
// available water after it (but not if it's already there, which happens
// when splitting before a conditional branch that is followed by an
// unconditional branch - in that case we want to insert NewBB).
- std::vector<MachineBasicBlock*>::iterator IP =
+ water_iterator IP =
std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
CompareMBBNumbers);
MachineBasicBlock* WaterBB = *IP;
@@ -648,7 +694,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
// We removed instructions from UserMBB, subtract that off from its size.
// Add 2 or 4 to the block to count the unconditional branch we added to it.
- unsigned delta = isThumb ? 2 : 4;
+ int delta = isThumb1 ? 2 : 4;
BBSizes[OrigBBI] -= NewBBSize - delta;
// ...and adjust BBOffsets for NewBB accordingly.
@@ -664,24 +710,39 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
/// reference) is within MaxDisp of TrialOffset (a proposed location of a
/// constant pool entry).
bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset,
- unsigned TrialOffset, unsigned MaxDisp, bool NegativeOK) {
+ unsigned TrialOffset, unsigned MaxDisp,
+ bool NegativeOK, bool IsSoImm) {
// On Thumb offsets==2 mod 4 are rounded down by the hardware for
// purposes of the displacement computation; compensate for that here.
// Effectively, the valid range of displacements is 2 bytes smaller for such
// references.
- if (isThumb && UserOffset%4 !=0)
+ unsigned TotalAdj = 0;
+ if (isThumb && UserOffset%4 !=0) {
UserOffset -= 2;
+ TotalAdj = 2;
+ }
// CPEs will be rounded up to a multiple of 4.
- if (isThumb && TrialOffset%4 != 0)
+ if (isThumb && TrialOffset%4 != 0) {
TrialOffset += 2;
+ TotalAdj += 2;
+ }
+
+ // In Thumb2 mode, later branch adjustments can shift instructions up and
+  // cause alignment changes. In the worst case scenario this can cause the
+  // user's effective address to decrease by 2 and the CPE's address to
+  // increase by 2.
+ if (isThumb2 && TotalAdj != 4)
+ MaxDisp -= (4 - TotalAdj);
if (UserOffset <= TrialOffset) {
// User before the Trial.
- if (TrialOffset-UserOffset <= MaxDisp)
+ if (TrialOffset - UserOffset <= MaxDisp)
return true;
+ // FIXME: Make use full range of soimm values.
} else if (NegativeOK) {
- if (UserOffset-TrialOffset <= MaxDisp)
+ if (UserOffset - TrialOffset <= MaxDisp)
return true;
+ // FIXME: Make use full range of soimm values.
}
return false;
}
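Note: a worked instance of the new Thumb2 conservatism (numbers chosen only
for illustration):

    // UserOffset = 102 (2 mod 4)  -> treated as 100, TotalAdj = 2
    // TrialOffset = 104 (aligned) -> unchanged,      TotalAdj stays 2
    // isThumb2 && TotalAdj != 4   -> MaxDisp -= (4 - 2) = 2
    // so the forward check becomes: 104 - 100 <= MaxDisp - 2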
@@ -690,39 +751,36 @@ bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset,
/// Water (a basic block) will be in range for the specific MI.
bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset,
- MachineBasicBlock* Water, CPUser &U)
-{
+ MachineBasicBlock* Water, CPUser &U) {
unsigned MaxDisp = U.MaxDisp;
- MachineFunction::iterator I = next(MachineFunction::iterator(Water));
unsigned CPEOffset = BBOffsets[Water->getNumber()] +
BBSizes[Water->getNumber()];
// If the CPE is to be inserted before the instruction, that will raise
- // the offset of the instruction. (Currently applies only to ARM, so
- // no alignment compensation attempted here.)
+ // the offset of the instruction.
if (CPEOffset < UserOffset)
UserOffset += U.CPEMI->getOperand(2).getImm();
- return OffsetIsInRange (UserOffset, CPEOffset, MaxDisp, !isThumb);
+ return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, U.NegOk, U.IsSoImm);
}
/// CPEIsInRange - Returns true if the distance between specific MI and
/// specific ConstPool entry instruction can fit in MI's displacement field.
bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset,
- MachineInstr *CPEMI,
- unsigned MaxDisp, bool DoDump) {
+ MachineInstr *CPEMI, unsigned MaxDisp,
+ bool NegOk, bool DoDump) {
unsigned CPEOffset = GetOffsetOf(CPEMI);
assert(CPEOffset%4 == 0 && "Misaligned CPE");
if (DoDump) {
- DOUT << "User of CPE#" << CPEMI->getOperand(0).getImm()
- << " max delta=" << MaxDisp
- << " insn address=" << UserOffset
- << " CPE address=" << CPEOffset
- << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI;
+ DEBUG(errs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+ << " max delta=" << MaxDisp
+ << " insn address=" << UserOffset
+ << " CPE address=" << CPEOffset
+ << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI);
}
- return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, !isThumb);
+ return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
}
#ifndef NDEBUG
@@ -745,52 +803,48 @@ static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB,
int delta) {
MachineFunction::iterator MBBI = BB; MBBI = next(MBBI);
- for(unsigned i=BB->getNumber()+1; i<BB->getParent()->getNumBlockIDs(); i++) {
+  for (unsigned i = BB->getNumber()+1, e = BB->getParent()->getNumBlockIDs();
+       i < e; ++i) {
BBOffsets[i] += delta;
// If some existing blocks have padding, adjust the padding as needed, a
// bit tricky. delta can be negative so don't use % on that.
- if (isThumb) {
- MachineBasicBlock *MBB = MBBI;
- if (!MBB->empty()) {
- // Constant pool entries require padding.
- if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
- unsigned oldOffset = BBOffsets[i] - delta;
- if (oldOffset%4==0 && BBOffsets[i]%4!=0) {
- // add new padding
- BBSizes[i] += 2;
- delta += 2;
- } else if (oldOffset%4!=0 && BBOffsets[i]%4==0) {
- // remove existing padding
- BBSizes[i] -=2;
- delta -= 2;
- }
+ if (!isThumb)
+ continue;
+ MachineBasicBlock *MBB = MBBI;
+ if (!MBB->empty()) {
+ // Constant pool entries require padding.
+ if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
+ unsigned OldOffset = BBOffsets[i] - delta;
+ if ((OldOffset%4) == 0 && (BBOffsets[i]%4) != 0) {
+ // add new padding
+ BBSizes[i] += 2;
+ delta += 2;
+ } else if ((OldOffset%4) != 0 && (BBOffsets[i]%4) == 0) {
+ // remove existing padding
+ BBSizes[i] -= 2;
+ delta -= 2;
}
- // Thumb jump tables require padding. They should be at the end;
- // following unconditional branches are removed by AnalyzeBranch.
- MachineInstr *ThumbJTMI = NULL;
- if ((prior(MBB->end())->getOpcode() == ARM::tBR_JTr)
- || (prior(MBB->end())->getOpcode() == ARM::t2BR_JTr)
- || (prior(MBB->end())->getOpcode() == ARM::t2BR_JTm)
- || (prior(MBB->end())->getOpcode() == ARM::t2BR_JTadd))
- ThumbJTMI = prior(MBB->end());
- if (ThumbJTMI) {
- unsigned newMIOffset = GetOffsetOf(ThumbJTMI);
- unsigned oldMIOffset = newMIOffset - delta;
- if (oldMIOffset%4 == 0 && newMIOffset%4 != 0) {
- // remove existing padding
- BBSizes[i] -= 2;
- delta -= 2;
- } else if (oldMIOffset%4 != 0 && newMIOffset%4 == 0) {
- // add new padding
- BBSizes[i] += 2;
- delta += 2;
- }
+ }
+ // Thumb1 jump tables require padding. They should be at the end;
+ // following unconditional branches are removed by AnalyzeBranch.
+ MachineInstr *ThumbJTMI = prior(MBB->end());
+ if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) {
+ unsigned NewMIOffset = GetOffsetOf(ThumbJTMI);
+ unsigned OldMIOffset = NewMIOffset - delta;
+ if ((OldMIOffset%4) == 0 && (NewMIOffset%4) != 0) {
+ // remove existing padding
+ BBSizes[i] -= 2;
+ delta -= 2;
+ } else if ((OldMIOffset%4) != 0 && (NewMIOffset%4) == 0) {
+ // add new padding
+ BBSizes[i] += 2;
+ delta += 2;
}
- if (delta==0)
- return;
}
- MBBI = next(MBBI);
+ if (delta==0)
+ return;
}
+ MBBI = next(MBBI);
}
}
@@ -824,8 +878,8 @@ int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset)
MachineInstr *CPEMI = U.CPEMI;
// Check to see if the CPE is already in-range.
- if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, true)) {
- DOUT << "In range\n";
+ if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, U.NegOk, true)) {
+ DEBUG(errs() << "In range\n");
return 1;
}
@@ -839,8 +893,9 @@ int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset)
// Removing CPEs can leave empty entries, skip
if (CPEs[i].CPEMI == NULL)
continue;
- if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, false)) {
- DOUT << "Replacing CPE#" << CPI << " with CPE#" << CPEs[i].CPI << "\n";
+ if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, U.NegOk)) {
+ DEBUG(errs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
// Point the CPUser node to the replacement
U.CPEMI = CPEs[i].CPEMI;
// Change the CPI in the instruction operand to refer to the clone.
@@ -870,15 +925,15 @@ static inline unsigned getUnconditionalBrDisp(int Opc) {
default:
break;
}
-
+
return ((1<<23)-1)*4;
}
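Note: the default case is the ARM encoding. The maximum forward reaches work
out as follows (a sketch; the tB/t2B cases sit in the elided part of the
switch):

    // ARM B: 24-bit signed word offset     -> ((1<<23)-1)*4, about 32MB
    // tB:    11-bit signed halfword offset -> ((1<<10)-1)*2  = 2046 bytes
    // t2B:   Thumb2 wide branch, halfwords -> ((1<<23)-1)*2, about 16MB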
/// AcceptWater - Small amount of common code factored out of the following.
-
-MachineBasicBlock* ARMConstantIslands::AcceptWater(MachineBasicBlock *WaterBB,
- std::vector<MachineBasicBlock*>::iterator IP) {
- DOUT << "found water in range\n";
+///
+MachineBasicBlock *ARMConstantIslands::AcceptWater(water_iterator IP) {
+ DEBUG(errs() << "found water in range\n");
+ MachineBasicBlock *WaterBB = *IP;
// Remove the original WaterList entry; we want subsequent
// insertions in this vicinity to go after the one we're
// about to insert. This considerably reduces the number
@@ -890,41 +945,44 @@ MachineBasicBlock* ARMConstantIslands::AcceptWater(MachineBasicBlock *WaterBB,
/// LookForWater - look for an existing entry in the WaterList in which
/// we can place the CPE referenced from U so it's within range of U's MI.
-/// Returns true if found, false if not. If it returns true, *NewMBB
-/// is set to the WaterList entry.
-/// For ARM, we prefer the water that's farthest away. For Thumb, prefer
-/// water that will not introduce padding to water that will; within each
-/// group, prefer the water that's farthest away.
-
+/// Returns true if found, false if not. If it returns true, NewMBB
+/// is set to the WaterList entry. For Thumb, prefer water that will not
+/// introduce padding to water that will. To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
- MachineBasicBlock** NewMBB) {
- std::vector<MachineBasicBlock*>::iterator IPThatWouldPad;
- MachineBasicBlock* WaterBBThatWouldPad = NULL;
- if (!WaterList.empty()) {
- for (std::vector<MachineBasicBlock*>::iterator IP = prior(WaterList.end()),
- B = WaterList.begin();; --IP) {
- MachineBasicBlock* WaterBB = *IP;
- if (WaterIsInRange(UserOffset, WaterBB, U)) {
- if (isThumb &&
- (BBOffsets[WaterBB->getNumber()] +
- BBSizes[WaterBB->getNumber()])%4 != 0) {
- // This is valid Water, but would introduce padding. Remember
- // it in case we don't find any Water that doesn't do this.
- if (!WaterBBThatWouldPad) {
- WaterBBThatWouldPad = WaterBB;
- IPThatWouldPad = IP;
- }
- } else {
- *NewMBB = AcceptWater(WaterBB, IP);
- return true;
+ MachineBasicBlock *&NewMBB) {
+ if (WaterList.empty())
+ return false;
+
+ bool FoundWaterThatWouldPad = false;
+ water_iterator IPThatWouldPad;
+ for (water_iterator IP = prior(WaterList.end()),
+ B = WaterList.begin();; --IP) {
+ MachineBasicBlock* WaterBB = *IP;
+ // Check if water is in range and at a lower address than the current one.
+ if (WaterIsInRange(UserOffset, WaterBB, U) &&
+ WaterBB->getNumber() < U.CPEMI->getParent()->getNumber()) {
+ unsigned WBBId = WaterBB->getNumber();
+ if (isThumb &&
+ (BBOffsets[WBBId] + BBSizes[WBBId])%4 != 0) {
+ // This is valid Water, but would introduce padding. Remember
+ // it in case we don't find any Water that doesn't do this.
+ if (!FoundWaterThatWouldPad) {
+ FoundWaterThatWouldPad = true;
+ IPThatWouldPad = IP;
}
+ } else {
+ NewMBB = AcceptWater(IP);
+ return true;
+ }
}
- if (IP == B)
- break;
- }
+ if (IP == B)
+ break;
}
- if (isThumb && WaterBBThatWouldPad) {
- *NewMBB = AcceptWater(WaterBBThatWouldPad, IPThatWouldPad);
+ if (FoundWaterThatWouldPad) {
+ NewMBB = AcceptWater(IPThatWouldPad);
return true;
}
return false;
@@ -934,12 +992,12 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the
/// block is used if in range, and the conditional branch munged so control
/// flow is correct. Otherwise the block is split to create a hole with an
-/// unconditional branch around it. In either case *NewMBB is set to a
+/// unconditional branch around it. In either case NewMBB is set to a
/// block following which the new island can be inserted (the WaterList
/// is not adjusted).
-
void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
- unsigned UserOffset, MachineBasicBlock** NewMBB) {
+ unsigned UserOffset,
+ MachineBasicBlock *&NewMBB) {
CPUser &U = CPUsers[CPUserIndex];
MachineInstr *UserMI = U.MI;
MachineInstr *CPEMI = U.CPEMI;
@@ -950,18 +1008,18 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
// If the use is at the end of the block, or the end of the block
// is within range, make new water there. (The addition below is
- // for the unconditional branch we will be adding: 4 bytes on ARM,
- // 2 on Thumb. Possible Thumb alignment padding is allowed for
+ // for the unconditional branch we will be adding: 4 bytes on ARM + Thumb2,
+ // 2 on Thumb1. Possible Thumb1 alignment padding is allowed for
// inside OffsetIsInRange.
// If the block ends in an unconditional branch already, it is water,
// and is known to be out of range, so we'll always be adding a branch.)
if (&UserMBB->back() == UserMI ||
- OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb ? 2: 4),
- U.MaxDisp, !isThumb)) {
- DOUT << "Split at end of block\n";
+ OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb1 ? 2: 4),
+ U.MaxDisp, U.NegOk, U.IsSoImm)) {
+ DEBUG(errs() << "Split at end of block\n");
if (&UserMBB->back() == UserMI)
assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!");
- *NewMBB = next(MachineFunction::iterator(UserMBB));
+ NewMBB = next(MachineFunction::iterator(UserMBB));
// Add an unconditional branch from UserMBB to fallthrough block.
// Record it for branch lengthening; this new branch will not get out of
// range, but if the preceding conditional branch is out of range, the
@@ -969,16 +1027,16 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
// range, so the machinery has to know about it.
int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B;
BuildMI(UserMBB, DebugLoc::getUnknownLoc(),
- TII->get(UncondBr)).addMBB(*NewMBB);
+ TII->get(UncondBr)).addMBB(NewMBB);
unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
ImmBranches.push_back(ImmBranch(&UserMBB->back(),
MaxDisp, false, UncondBr));
- int delta = isThumb ? 2 : 4;
+ int delta = isThumb1 ? 2 : 4;
BBSizes[UserMBB->getNumber()] += delta;
AdjustBBOffsetsAfter(UserMBB, delta);
} else {
// What a big block. Find a place within the block to split it.
- // This is a little tricky on Thumb since instructions are 2 bytes
+ // This is a little tricky on Thumb1 since instructions are 2 bytes
// and constant pool entries are 4 bytes: if instruction I references
// island CPE, and instruction I+1 references CPE', it will
// not work well to put CPE as far forward as possible, since then
@@ -991,7 +1049,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
// if not, we back up the insertion point.
// The 4 in the following is for the unconditional branch we'll be
- // inserting (allows for long branch on Thumb). Alignment of the
+ // inserting (allows for long branch on Thumb1). Alignment of the
// island is handled inside OffsetIsInRange.
unsigned BaseInsertOffset = UserOffset + U.MaxDisp -4;
// This could point off the end of the block if we've already got
@@ -1000,7 +1058,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
// conditional and a maximally long unconditional).
if (BaseInsertOffset >= BBOffsets[UserMBB->getNumber()+1])
BaseInsertOffset = BBOffsets[UserMBB->getNumber()+1] -
- (isThumb ? 6 : 8);
+ (isThumb1 ? 6 : 8);
unsigned EndInsertOffset = BaseInsertOffset +
CPEMI->getOperand(2).getImm();
MachineBasicBlock::iterator MI = UserMI;
@@ -1011,10 +1069,11 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
Offset += TII->GetInstSizeInBytes(MI),
MI = next(MI)) {
if (CPUIndex < CPUsers.size() && CPUsers[CPUIndex].MI == MI) {
+ CPUser &U = CPUsers[CPUIndex];
if (!OffsetIsInRange(Offset, EndInsertOffset,
- CPUsers[CPUIndex].MaxDisp, !isThumb)) {
- BaseInsertOffset -= (isThumb ? 2 : 4);
- EndInsertOffset -= (isThumb ? 2 : 4);
+ U.MaxDisp, U.NegOk, U.IsSoImm)) {
+ BaseInsertOffset -= (isThumb1 ? 2 : 4);
+ EndInsertOffset -= (isThumb1 ? 2 : 4);
}
// This is overly conservative, as we don't account for CPEMIs
// being reused within the block, but it doesn't matter much.
@@ -1022,8 +1081,8 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
CPUIndex++;
}
}
- DOUT << "Split in middle of big block\n";
- *NewMBB = SplitBlockBeforeInstr(prior(MI));
+ DEBUG(errs() << "Split in middle of big block\n");
+ NewMBB = SplitBlockBeforeInstr(prior(MI));
}
}
@@ -1031,7 +1090,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
/// is out-of-range. If so, pick up the constant pool value and move it some
/// place in-range. Return true if we changed any addresses (thus must run
/// another pass of branch lengthening), false otherwise.
-bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn,
+bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF,
unsigned CPUserIndex) {
CPUser &U = CPUsers[CPUserIndex];
MachineInstr *UserMI = U.MI;
@@ -1040,14 +1099,9 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn,
unsigned Size = CPEMI->getOperand(2).getImm();
MachineBasicBlock *NewMBB;
// Compute this only once, it's expensive. The 4 or 8 is the value the
- // hardware keeps in the PC (2 insns ahead of the reference).
+ // hardware keeps in the PC.
unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8);
- // Special case: tLEApcrel are two instructions MI's. The actual user is the
- // second instruction.
- if (UserMI->getOpcode() == ARM::tLEApcrel)
- UserOffset += 2;
-
// See if the current entry is within range, or there is a clone of it
// in range.
int result = LookForExistingCPEntry(U, UserOffset);
@@ -1058,19 +1112,16 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn,
// We will be generating a new clone. Get a UID for it.
unsigned ID = AFI->createConstPoolEntryUId();
- // Look for water where we can place this CPE. We look for the farthest one
- // away that will work. Forward references only for now (although later
- // we might find some that are backwards).
-
- if (!LookForWater(U, UserOffset, &NewMBB)) {
+ // Look for water where we can place this CPE.
+ if (!LookForWater(U, UserOffset, NewMBB)) {
// No water found.
- DOUT << "No water found\n";
- CreateNewWater(CPUserIndex, UserOffset, &NewMBB);
+ DEBUG(errs() << "No water found\n");
+ CreateNewWater(CPUserIndex, UserOffset, NewMBB);
}
// Okay, we know we can put an island before NewMBB now, do it!
- MachineBasicBlock *NewIsland = Fn.CreateMachineBasicBlock();
- Fn.insert(NewMBB, NewIsland);
+ MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock();
+ MF.insert(NewMBB, NewIsland);
// Update internal data structures to account for the newly inserted MBB.
UpdateForInsertedWaterBlock(NewIsland);
@@ -1101,7 +1152,8 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn,
break;
}
- DOUT << " Moved CPE to #" << ID << " CPI=" << CPI << "\t" << *UserMI;
+ DEBUG(errs() << " Moved CPE to #" << ID << " CPI=" << CPI
+ << '\t' << *UserMI);
return true;
}
@@ -1115,7 +1167,7 @@ void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) {
BBSizes[CPEBB->getNumber()] -= Size;
// All succeeding offsets have the current size value added in, fix this.
if (CPEBB->empty()) {
- // In thumb mode, the size of island may be padded by two to compensate for
+    // In Thumb1 mode, the size of the island may be padded by two to compensate for
// the alignment requirement. Then it will now be 2 when the block is
// empty, so fix this.
// All succeeding offsets have the current size value added in, fix this.
@@ -1157,11 +1209,11 @@ bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
unsigned BrOffset = GetOffsetOf(MI) + PCAdj;
unsigned DestOffset = BBOffsets[DestBB->getNumber()];
- DOUT << "Branch of destination BB#" << DestBB->getNumber()
- << " from BB#" << MI->getParent()->getNumber()
- << " max delta=" << MaxDisp
- << " from " << GetOffsetOf(MI) << " to " << DestOffset
- << " offset " << int(DestOffset-BrOffset) << "\t" << *MI;
+ DEBUG(errs() << "Branch of destination BB#" << DestBB->getNumber()
+ << " from BB#" << MI->getParent()->getNumber()
+ << " max delta=" << MaxDisp
+ << " from " << GetOffsetOf(MI) << " to " << DestOffset
+ << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
if (BrOffset <= DestOffset) {
// Branch before the Dest.
@@ -1176,7 +1228,7 @@ bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
/// FixUpImmediateBr - Fix up an immediate branch whose destination is too far
/// away to fit in its displacement field.
-bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) {
+bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br) {
MachineInstr *MI = Br.MI;
MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
@@ -1185,8 +1237,8 @@ bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) {
return false;
if (!Br.isCond)
- return FixUpUnconditionalBr(Fn, Br);
- return FixUpConditionalBr(Fn, Br);
+ return FixUpUnconditionalBr(MF, Br);
+ return FixUpConditionalBr(MF, Br);
}
/// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is
@@ -1194,10 +1246,11 @@ bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) {
/// spilled in the epilogue, then we can use BL to implement a far jump.
/// Otherwise, add an intermediate branch instruction to a branch.
bool
-ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) {
+ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) {
MachineInstr *MI = Br.MI;
MachineBasicBlock *MBB = MI->getParent();
- assert(isThumb && !isThumb2 && "Expected a Thumb-1 function!");
+ if (!isThumb1)
+ llvm_unreachable("FixUpUnconditionalBr is Thumb1 only!");
// Use BL to implement far jump.
Br.MaxDisp = (1 << 21) * 2;
@@ -1207,7 +1260,7 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) {
HasFarJump = true;
NumUBrFixed++;
- DOUT << " Changed B to long jump " << *MI;
+ DEBUG(errs() << " Changed B to long jump " << *MI);
return true;
}
@@ -1216,7 +1269,7 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) {
/// far away to fit in its displacement field. It is converted to an inverse
/// conditional branch + an unconditional branch to the destination.
bool
-ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) {
+ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) {
MachineInstr *MI = Br.MI;
MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
@@ -1251,7 +1304,8 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) {
// b L1
MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
if (BBIsInRange(MI, NewDest, Br.MaxDisp)) {
- DOUT << " Invert Bcc condition and swap its destination with " << *BMI;
+ DEBUG(errs() << " Invert Bcc condition and swap its destination with "
+ << *BMI);
BMI->getOperand(0).setMBB(DestBB);
MI->getOperand(0).setMBB(NewDest);
MI->getOperand(1).setImm(CC);
@@ -1273,9 +1327,9 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) {
}
MachineBasicBlock *NextBB = next(MachineFunction::iterator(MBB));
- DOUT << " Insert B to BB#" << DestBB->getNumber()
- << " also invert condition and change dest. to BB#"
- << NextBB->getNumber() << "\n";
+ DEBUG(errs() << " Insert B to BB#" << DestBB->getNumber()
+ << " also invert condition and change dest. to BB#"
+ << NextBB->getNumber() << "\n");
// Insert a new conditional branch and a new unconditional branch.
// Also update the ImmBranch as well as adding a new entry for the new branch.
@@ -1300,14 +1354,17 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) {
}
/// UndoLRSpillRestore - Remove Thumb push / pop instructions that only spill
-/// LR / restores LR to pc.
+/// LR / restore LR to pc. FIXME: This is done here because it's only possible
+/// to do this if tBfar is not used.
bool ARMConstantIslands::UndoLRSpillRestore() {
bool MadeChange = false;
for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) {
MachineInstr *MI = PushPopMIs[i];
+    // The first two operands are predicates; the third is a zero since there
+    // is no writeback.
if (MI->getOpcode() == ARM::tPOP_RET &&
- MI->getOperand(0).getReg() == ARM::PC &&
- MI->getNumExplicitOperands() == 1) {
+ MI->getOperand(3).getReg() == ARM::PC &&
+ MI->getNumExplicitOperands() == 4) {
BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET));
MI->eraseFromParent();
MadeChange = true;
@@ -1315,3 +1372,201 @@ bool ARMConstantIslands::UndoLRSpillRestore() {
}
return MadeChange;
}
+
+bool ARMConstantIslands::OptimizeThumb2Instructions(MachineFunction &MF) {
+ bool MadeChange = false;
+
+ // Shrink ADR and LDR from constantpool.
+ for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
+ CPUser &U = CPUsers[i];
+ unsigned Opcode = U.MI->getOpcode();
+ unsigned NewOpc = 0;
+ unsigned Scale = 1;
+ unsigned Bits = 0;
+ switch (Opcode) {
+ default: break;
+ case ARM::t2LEApcrel:
+ if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+ NewOpc = ARM::tLEApcrel;
+ Bits = 8;
+ Scale = 4;
+ }
+ break;
+ case ARM::t2LDRpci:
+ if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+ NewOpc = ARM::tLDRpci;
+ Bits = 8;
+ Scale = 4;
+ }
+ break;
+ }
+
+ if (!NewOpc)
+ continue;
+
+ unsigned UserOffset = GetOffsetOf(U.MI) + 4;
+ unsigned MaxOffs = ((1 << Bits) - 1) * Scale;
+ // FIXME: Check if offset is multiple of scale if scale is not 4.
+ if (CPEIsInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) {
+ U.MI->setDesc(TII->get(NewOpc));
+ MachineBasicBlock *MBB = U.MI->getParent();
+ BBSizes[MBB->getNumber()] -= 2;
+ AdjustBBOffsetsAfter(MBB, -2);
+ ++NumT2CPShrunk;
+ MadeChange = true;
+ }
+ }
+
+ MadeChange |= OptimizeThumb2Branches(MF);
+ MadeChange |= OptimizeThumb2JumpTables(MF);
+ return MadeChange;
+}
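
The shrinking test above is pure range arithmetic: the narrow encodings targeted here (tLEApcrel, tLDRpci) have an 8-bit offset field scaled by 4, so the farthest constant-pool entry they can reach is (2^8 - 1) * 4 = 1020 bytes past the adjusted PC. A minimal standalone sketch of that check (fitsNarrowEncoding is a hypothetical name; the real CPEIsInRange also handles negative offsets and soimm users):

    #include <cstdio>

    // Can a pc-relative reference at UserOffset reach a constant-pool entry
    // at CPEOffset with a 'Bits'-bit field scaled by 'Scale'? Mirrors the
    // MaxOffs computation in OptimizeThumb2Instructions; forward-only here.
    static bool fitsNarrowEncoding(unsigned UserOffset, unsigned CPEOffset,
                                   unsigned Bits, unsigned Scale) {
      unsigned MaxOffs = ((1u << Bits) - 1) * Scale; // 255 * 4 = 1020
      return CPEOffset >= UserOffset && CPEOffset - UserOffset <= MaxOffs;
    }

    int main() {
      std::printf("%d\n", fitsNarrowEncoding(100, 1100, 8, 4)); // 1
      std::printf("%d\n", fitsNarrowEncoding(100, 1200, 8, 4)); // 0
    }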
+
+bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) {
+ bool MadeChange = false;
+
+ for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) {
+ ImmBranch &Br = ImmBranches[i];
+ unsigned Opcode = Br.MI->getOpcode();
+ unsigned NewOpc = 0;
+ unsigned Scale = 1;
+ unsigned Bits = 0;
+ switch (Opcode) {
+ default: break;
+ case ARM::t2B:
+ NewOpc = ARM::tB;
+ Bits = 11;
+ Scale = 2;
+ break;
+ case ARM::t2Bcc:
+ NewOpc = ARM::tBcc;
+ Bits = 8;
+ Scale = 2;
+ break;
+ }
+ if (!NewOpc)
+ continue;
+
+ unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+ MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+ if (BBIsInRange(Br.MI, DestBB, MaxOffs)) {
+ Br.MI->setDesc(TII->get(NewOpc));
+ MachineBasicBlock *MBB = Br.MI->getParent();
+ BBSizes[MBB->getNumber()] -= 2;
+ AdjustBBOffsetsAfter(MBB, -2);
+ ++NumT2BrShrunk;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
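
The MaxOffs formula here differs from the constant-pool case because branch displacements are signed: a Bits-wide field scaled by 2 reaches ((1 << (Bits-1)) - 1) * 2 bytes forward, i.e. 2046 bytes for tB (11 bits) and 254 bytes for tBcc (8 bits). A small sketch under those assumptions:

    #include <cstdio>

    // Forward reach of a signed 'Bits'-bit branch field scaled by 'Scale',
    // as in OptimizeThumb2Branches above.
    static unsigned maxBranchOffset(unsigned Bits, unsigned Scale) {
      return ((1u << (Bits - 1)) - 1) * Scale;
    }

    int main() {
      std::printf("tB:   %u\n", maxBranchOffset(11, 2)); // 2046
      std::printf("tBcc: %u\n", maxBranchOffset(8, 2));  // 254
    }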
+
+
+/// OptimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
+/// jumptables when it's possible.
+bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) {
+ bool MadeChange = false;
+
+  // FIXME: After the tables are shrunk, can we get rid of some of the
+  // constantpool tables?
+ const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+ MachineInstr *MI = T2JumpTables[i];
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned NumOps = TID.getNumOperands();
+ unsigned JTOpIdx = NumOps - (TID.isPredicable() ? 3 : 2);
+ MachineOperand JTOP = MI->getOperand(JTOpIdx);
+ unsigned JTI = JTOP.getIndex();
+ assert(JTI < JT.size());
+
+ bool ByteOk = true;
+ bool HalfWordOk = true;
+ unsigned JTOffset = GetOffsetOf(MI) + 4;
+ const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+ for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+ MachineBasicBlock *MBB = JTBBs[j];
+ unsigned DstOffset = BBOffsets[MBB->getNumber()];
+      // A negative offset is not OK. FIXME: We should change the BB layout to
+      // make sure all the branches are forward.
+ if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2)
+ ByteOk = false;
+ unsigned TBHLimit = ((1<<16)-1)*2;
+ if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit)
+ HalfWordOk = false;
+ if (!ByteOk && !HalfWordOk)
+ break;
+ }
+
+ if (ByteOk || HalfWordOk) {
+ MachineBasicBlock *MBB = MI->getParent();
+ unsigned BaseReg = MI->getOperand(0).getReg();
+ bool BaseRegKill = MI->getOperand(0).isKill();
+ if (!BaseRegKill)
+ continue;
+ unsigned IdxReg = MI->getOperand(1).getReg();
+ bool IdxRegKill = MI->getOperand(1).isKill();
+ MachineBasicBlock::iterator PrevI = MI;
+ if (PrevI == MBB->begin())
+ continue;
+
+ MachineInstr *AddrMI = --PrevI;
+ bool OptOk = true;
+    // Examine the instruction that calculates the jumptable entry address.
+    // If it's not the one just before the t2BR_JT, we won't be able to delete
+    // it, so the optimization isn't worth doing.
+ for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) {
+ const MachineOperand &MO = AddrMI->getOperand(k);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isDef() && MO.getReg() != BaseReg) {
+ OptOk = false;
+ break;
+ }
+ if (MO.isUse() && !MO.isKill() && MO.getReg() != IdxReg) {
+ OptOk = false;
+ break;
+ }
+ }
+ if (!OptOk)
+ continue;
+
+    // The previous instruction should be a tLEApcrelJT or t2LEApcrelJT; we
+    // want to delete it as well.
+ MachineInstr *LeaMI = --PrevI;
+ if ((LeaMI->getOpcode() != ARM::tLEApcrelJT &&
+ LeaMI->getOpcode() != ARM::t2LEApcrelJT) ||
+ LeaMI->getOperand(0).getReg() != BaseReg)
+ OptOk = false;
+
+ if (!OptOk)
+ continue;
+
+ unsigned Opc = ByteOk ? ARM::t2TBB : ARM::t2TBH;
+ MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc))
+ .addReg(IdxReg, getKillRegState(IdxRegKill))
+ .addJumpTableIndex(JTI, JTOP.getTargetFlags())
+ .addImm(MI->getOperand(JTOpIdx+1).getImm());
+      // FIXME: Insert an "ALIGN" instruction to ensure the next instruction
+      // is 2-byte aligned. For now, the asm printer will fix it up.
+ unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI);
+ unsigned OrigSize = TII->GetInstSizeInBytes(AddrMI);
+ OrigSize += TII->GetInstSizeInBytes(LeaMI);
+ OrigSize += TII->GetInstSizeInBytes(MI);
+
+ AddrMI->eraseFromParent();
+ LeaMI->eraseFromParent();
+ MI->eraseFromParent();
+
+ int delta = OrigSize - NewSize;
+ BBSizes[MBB->getNumber()] -= delta;
+ AdjustBBOffsetsAfter(MBB, -delta);
+
+ ++NumTBs;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
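
The feasibility scan above boils down to: every destination must lie at a non-negative offset from the table base, at most (2^8 - 1) * 2 = 510 bytes for tbb (byte entries, each holding offset/2) or (2^16 - 1) * 2 = 131070 bytes for tbh. A standalone sketch of that test (tableOk is an illustrative name):

    #include <cstdio>

    // Check whether all jump-table destinations fit tbb/tbh entries of
    // 'EntryBits' bits, as in OptimizeThumb2JumpTables above.
    static bool tableOk(const unsigned *DstOffsets, unsigned N,
                        unsigned JTOffset, unsigned EntryBits) {
      unsigned Limit = ((1u << EntryBits) - 1) * 2; // 510 (tbb), 131070 (tbh)
      for (unsigned i = 0; i != N; ++i)
        if (DstOffsets[i] < JTOffset || DstOffsets[i] - JTOffset > Limit)
          return false;
      return true;
    }

    int main() {
      unsigned Dsts[] = { 120, 400, 700 };
      std::printf("tbb: %d\n", tableOk(Dsts, 3, 100, 8));  // 0 (600 > 510)
      std::printf("tbh: %d\n", tableOk(Dsts, 3, 100, 16)); // 1
    }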
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index a75ed3b..7170089 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -15,33 +15,31 @@
#include "llvm/ADT/FoldingSet.h"
#include "llvm/GlobalValue.h"
#include "llvm/Type.h"
-#include "llvm/Support/Streams.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+#include <cstring>
using namespace llvm;
ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, unsigned id,
- ARMCP::ARMCPKind k,
+ ARMCP::ARMCPKind K,
unsigned char PCAdj,
const char *Modif,
bool AddCA)
: MachineConstantPoolValue((const Type*)gv->getType()),
- GV(gv), S(NULL), LabelId(id), Kind(k), PCAdjust(PCAdj),
+ GV(gv), S(NULL), LabelId(id), Kind(K), PCAdjust(PCAdj),
Modifier(Modif), AddCurrentAddress(AddCA) {}
-ARMConstantPoolValue::ARMConstantPoolValue(const char *s, unsigned id,
- ARMCP::ARMCPKind k,
+ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C,
+ const char *s, unsigned id,
unsigned char PCAdj,
const char *Modif,
bool AddCA)
- : MachineConstantPoolValue((const Type*)Type::Int32Ty),
- GV(NULL), S(s), LabelId(id), Kind(k), PCAdjust(PCAdj),
+ : MachineConstantPoolValue((const Type*)Type::getInt32Ty(C)),
+ GV(NULL), S(strdup(s)), LabelId(id), Kind(ARMCP::CPValue), PCAdjust(PCAdj),
Modifier(Modif), AddCurrentAddress(AddCA) {}
-ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv,
- ARMCP::ARMCPKind k,
- const char *Modif)
- : MachineConstantPoolValue((const Type*)Type::Int32Ty),
- GV(gv), S(NULL), LabelId(0), Kind(k), PCAdjust(0),
+ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, const char *Modif)
+ : MachineConstantPoolValue((const Type*)Type::getInt32Ty(gv->getContext())),
+ GV(gv), S(NULL), LabelId(0), Kind(ARMCP::CPValue), PCAdjust(0),
Modifier(Modif) {}
int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
@@ -56,7 +54,6 @@ int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
if (CPV->GV == GV &&
CPV->S == S &&
CPV->LabelId == LabelId &&
- CPV->Kind == Kind &&
CPV->PCAdjust == PCAdjust)
return i;
}
@@ -65,31 +62,28 @@ int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
return -1;
}
+ARMConstantPoolValue::~ARMConstantPoolValue() {
+ free((void*)S);
+}
+
void
ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) {
ID.AddPointer(GV);
ID.AddPointer(S);
ID.AddInteger(LabelId);
- ID.AddInteger((unsigned)Kind);
ID.AddInteger(PCAdjust);
}
void ARMConstantPoolValue::dump() const {
- cerr << " " << *this;
+ errs() << " " << *this;
}
-void ARMConstantPoolValue::print(std::ostream &O) const {
- raw_os_ostream RawOS(O);
- print(RawOS);
-}
void ARMConstantPoolValue::print(raw_ostream &O) const {
if (GV)
O << GV->getName();
else
O << S;
- if (isNonLazyPointer()) O << "$non_lazy_ptr";
- else if (isStub()) O << "$stub";
if (Modifier) O << "(" << Modifier << ")";
if (PCAdjust != 0) {
O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index d2b9066..00c4808 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -15,17 +15,16 @@
#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
#include "llvm/CodeGen/MachineConstantPool.h"
-#include <iosfwd>
namespace llvm {
class GlobalValue;
+class LLVMContext;
namespace ARMCP {
enum ARMCPKind {
CPValue,
- CPNonLazyPtr,
- CPStub
+ CPLSDA
};
}
@@ -36,7 +35,7 @@ class ARMConstantPoolValue : public MachineConstantPoolValue {
GlobalValue *GV; // GlobalValue being loaded.
const char *S; // ExtSymbol being loaded.
unsigned LabelId; // Label id of the load.
- ARMCP::ARMCPKind Kind; // non_lazy_ptr or stub?
+ ARMCP::ARMCPKind Kind; // Value or LSDA?
unsigned char PCAdjust; // Extra adjustment if constantpool is pc relative.
// 8 for ARM, 4 for Thumb.
const char *Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8))
@@ -47,12 +46,12 @@ public:
ARMCP::ARMCPKind Kind = ARMCP::CPValue,
unsigned char PCAdj = 0, const char *Modifier = NULL,
bool AddCurrentAddress = false);
- ARMConstantPoolValue(const char *s, unsigned id,
- ARMCP::ARMCPKind Kind = ARMCP::CPValue,
+ ARMConstantPoolValue(LLVMContext &C, const char *s, unsigned id,
unsigned char PCAdj = 0, const char *Modifier = NULL,
bool AddCurrentAddress = false);
- ARMConstantPoolValue(GlobalValue *GV, ARMCP::ARMCPKind Kind,
- const char *Modifier);
+ ARMConstantPoolValue(GlobalValue *GV, const char *Modifier);
+ ARMConstantPoolValue();
+ ~ARMConstantPoolValue();
GlobalValue *getGV() const { return GV; }
@@ -61,27 +60,27 @@ public:
bool hasModifier() const { return Modifier != NULL; }
bool mustAddCurrentAddress() const { return AddCurrentAddress; }
unsigned getLabelId() const { return LabelId; }
- bool isNonLazyPointer() const { return Kind == ARMCP::CPNonLazyPtr; }
- bool isStub() const { return Kind == ARMCP::CPStub; }
unsigned char getPCAdjustment() const { return PCAdjust; }
+ bool isLSDA() { return Kind == ARMCP::CPLSDA; }
+
+ virtual unsigned getRelocationInfo() const {
+ // FIXME: This is conservatively claiming that these entries require a
+ // relocation, we may be able to do better than this.
+ return 2;
+ }
+
virtual int getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment);
virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID);
- void print(std::ostream *O) const { if (O) print(*O); }
- void print(std::ostream &O) const;
void print(raw_ostream *O) const { if (O) print(*O); }
void print(raw_ostream &O) const;
void dump() const;
};
- inline std::ostream &operator<<(std::ostream &O, const ARMConstantPoolValue &V) {
- V.print(O);
- return O;
-}
-
+
inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
V.print(O);
return O;
diff --git a/lib/Target/ARM/ARMFrameInfo.h b/lib/Target/ARM/ARMFrameInfo.h
index 405b8f2..d5dae24 100644
--- a/lib/Target/ARM/ARMFrameInfo.h
+++ b/lib/Target/ARM/ARMFrameInfo.h
@@ -15,15 +15,15 @@
#define ARM_FRAMEINFO_H
#include "ARM.h"
-#include "llvm/Target/TargetFrameInfo.h"
#include "ARMSubtarget.h"
+#include "llvm/Target/TargetFrameInfo.h"
namespace llvm {
class ARMFrameInfo : public TargetFrameInfo {
public:
explicit ARMFrameInfo(const ARMSubtarget &ST)
- : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0) {
+ : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0, 4) {
}
};
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6485fc1..bebf4e8 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -21,6 +21,7 @@
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -30,10 +31,10 @@
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-using namespace llvm;
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
-static const unsigned arm_dsubreg_0 = 5;
-static const unsigned arm_dsubreg_1 = 6;
+using namespace llvm;
//===--------------------------------------------------------------------===//
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
@@ -48,8 +49,9 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
const ARMSubtarget *Subtarget;
public:
- explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm)
- : SelectionDAGISel(tm), TM(tm),
+ explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
+ CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), TM(tm),
Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
}
@@ -57,7 +59,8 @@ public:
return "ARM Instruction Selection";
}
- /// getI32Imm - Return a target constant with the specified value, of type i32.
+ /// getI32Imm - Return a target constant of type i32 with the specified
+ /// value.
inline SDValue getI32Imm(unsigned Imm) {
return CurDAG->getTargetConstant(Imm, MVT::i32);
}
@@ -74,6 +77,8 @@ public:
SDValue &Offset, SDValue &Opc);
bool SelectAddrMode3Offset(SDValue Op, SDValue N,
SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode4(SDValue Op, SDValue N, SDValue &Addr,
+ SDValue &Mode);
bool SelectAddrMode5(SDValue Op, SDValue N, SDValue &Base,
SDValue &Offset);
bool SelectAddrMode6(SDValue Op, SDValue N, SDValue &Addr, SDValue &Update,
@@ -118,15 +123,63 @@ private:
SDNode *SelectARMIndexedLoad(SDValue Op);
SDNode *SelectT2IndexedLoad(SDValue Op);
+ /// SelectDYN_ALLOC - Select dynamic alloc for Thumb.
+ SDNode *SelectDYN_ALLOC(SDValue Op);
+
+ /// SelectVLD - Select NEON load intrinsics. NumVecs should
+ /// be 2, 3 or 4. The opcode arrays specify the instructions used for
+ /// loads of D registers and even subregs and odd subregs of Q registers.
+ /// For NumVecs == 2, QOpcodes1 is not used.
+ SDNode *SelectVLD(SDValue Op, unsigned NumVecs, unsigned *DOpcodes,
+ unsigned *QOpcodes0, unsigned *QOpcodes1);
+
+ /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should
+ /// be 2, 3 or 4. The opcode arrays specify the instructions used for
+ /// load/store of D registers and even subregs and odd subregs of Q registers.
+ SDNode *SelectVLDSTLane(SDValue Op, bool IsLoad, unsigned NumVecs,
+ unsigned *DOpcodes, unsigned *QOpcodes0,
+ unsigned *QOpcodes1);
+
+ /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
+ SDNode *SelectV6T2BitfieldExtractOp(SDValue Op, unsigned Opc);
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
char ConstraintCode,
std::vector<SDValue> &OutOps);
+
+ /// PairDRegs - Insert a pair of double registers into an implicit def to
+ /// form a quad register.
+ SDNode *PairDRegs(EVT VT, SDValue V0, SDValue V1);
};
}
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+ if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+ Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ return true;
+ }
+ return false;
+}
+
+// isInt32Immediate - This method tests to see if the operand is a 32-bit
+// constant. If so, Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+ return isInt32Immediate(N.getNode(), Imm);
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node has a specific
+// opcode and an immediate integer as its right operand.
+// If so, Imm will receive the 32-bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+ return N->getOpcode() == Opc &&
+ isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+
void ARMDAGToDAGISel::InstructionSelect() {
DEBUG(BB->dump());
@@ -144,7 +197,7 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue Op,
// Don't match base register only case. That is matched to a separate
// lower complexity pattern with explicit register operand.
if (ShOpcVal == ARM_AM::no_shift) return false;
-
+
BaseReg = N.getOperand(0);
unsigned ShImmVal = 0;
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
@@ -198,7 +251,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N,
MVT::i32);
return true;
}
-
+
// Match simple R +/- imm12 operands.
if (N.getOpcode() == ISD::ADD)
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
@@ -223,15 +276,15 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N,
return true;
}
}
-
+
// Otherwise this is R +/- [possibly shifted] R
ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::ADD ? ARM_AM::add:ARM_AM::sub;
ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
unsigned ShAmt = 0;
-
+
Base = N.getOperand(0);
Offset = N.getOperand(1);
-
+
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't fold
// it.
@@ -243,7 +296,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N,
ShOpcVal = ARM_AM::no_shift;
}
}
-
+
// Try matching (R shl C) + (R).
if (N.getOpcode() == ISD::ADD && ShOpcVal == ARM_AM::no_shift) {
ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
@@ -260,7 +313,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N,
}
}
}
-
+
Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
MVT::i32);
return true;
@@ -315,7 +368,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue Op, SDValue N,
Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32);
return true;
}
-
+
if (N.getOpcode() != ISD::ADD) {
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
@@ -326,7 +379,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue Op, SDValue N,
Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
return true;
}
-
+
// If the RHS is +/- imm8, fold into addr mode.
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
int RHSC = (int)RHS->getZExtValue();
@@ -348,7 +401,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue Op, SDValue N,
return true;
}
}
-
+
Base = N.getOperand(0);
Offset = N.getOperand(1);
Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32);
@@ -377,6 +430,12 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDValue Op, SDValue N,
return true;
}
+bool ARMDAGToDAGISel::SelectAddrMode4(SDValue Op, SDValue N,
+ SDValue &Addr, SDValue &Mode) {
+ Addr = N;
+ Mode = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+}
bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N,
SDValue &Base, SDValue &Offset) {
@@ -392,7 +451,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N,
MVT::i32);
return true;
}
-
+
// If the RHS is +/- imm8, fold into addr mode.
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
int RHSC = (int)RHS->getZExtValue();
@@ -417,7 +476,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N,
}
}
}
-
+
Base = N;
Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
MVT::i32);
@@ -428,14 +487,14 @@ bool ARMDAGToDAGISel::SelectAddrMode6(SDValue Op, SDValue N,
SDValue &Addr, SDValue &Update,
SDValue &Opc) {
Addr = N;
- // The optional writeback is handled in ARMLoadStoreOpt.
+ // Default to no writeback.
Update = CurDAG->getRegister(0, MVT::i32);
Opc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(false), MVT::i32);
return true;
}
bool ARMDAGToDAGISel::SelectAddrModePC(SDValue Op, SDValue N,
- SDValue &Offset, SDValue &Label) {
+ SDValue &Offset, SDValue &Label) {
if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
Offset = N.getOperand(0);
SDValue N1 = N.getOperand(1);
@@ -451,13 +510,11 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue Op, SDValue N,
// FIXME dl should come from the parent load or store, not the address
DebugLoc dl = Op.getDebugLoc();
if (N.getOpcode() != ISD::ADD) {
- Base = N;
- // We must materialize a zero in a reg! Returning a constant here
- // wouldn't work without additional code to position the node within
- // ISel's topological ordering in a place where ISel will process it
- // normally. Instead, just explicitly issue a tMOVri8 node!
- Offset = SDValue(CurDAG->getTargetNode(ARM::tMOVi8, dl, MVT::i32,
- CurDAG->getTargetConstant(0, MVT::i32)), 0);
+ ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
+ if (!NC || NC->getZExtValue() != 0)
+ return false;
+
+ Base = Offset = N;
return true;
}
@@ -567,7 +624,7 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue Op, SDValue N,
}
}
}
-
+
return false;
}
@@ -594,41 +651,70 @@ bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue Op, SDValue N,
bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue Op, SDValue N,
SDValue &Base, SDValue &OffImm) {
// Match simple R + imm12 operands.
- if (N.getOpcode() != ISD::ADD)
- return false;
+
+ // Base only.
+ if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) {
+ if (N.getOpcode() == ISD::FrameIndex) {
+ // Match frame index...
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ } else if (N.getOpcode() == ARMISD::Wrapper) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::TargetConstantPool)
+ return false; // We want to select t2LDRpci instead.
+ } else
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ if (SelectT2AddrModeImm8(Op, N, Base, OffImm))
+ // Let t2LDRi8 handle (R - imm8).
+ return false;
+
int RHSC = (int)RHS->getZExtValue();
- if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits.
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+
+ if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ }
OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
return true;
}
}
- return false;
+ // Base only.
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
}
bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue Op, SDValue N,
SDValue &Base, SDValue &OffImm) {
- if (N.getOpcode() == ISD::ADD) {
+ // Match simple R - imm8 operands.
+ if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::SUB) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
- int RHSC = (int)RHS->getZExtValue();
- if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
- Base = N.getOperand(0);
+ int RHSC = (int)RHS->getSExtValue();
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+
+ if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative)
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ }
OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
return true;
}
}
- } else if (N.getOpcode() == ISD::SUB) {
- if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
- int RHSC = (int)RHS->getZExtValue();
- if (RHSC >= 0 && RHSC < 0x100) { // 8 bits.
- Base = N.getOperand(0);
- OffImm = CurDAG->getTargetConstant(-RHSC, MVT::i32);
- return true;
- }
- }
}
return false;
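
Taken together, SelectT2AddrModeImm12 and SelectT2AddrModeImm8 partition reg+imm offsets: 0..4095 selects the imm12 form (t2LDRi12), -255..-1 the imm8 form (t2LDRi8), and anything else falls through to the register-offset form. A hedged standalone sketch of that partition (classifyOffset is an illustrative name, not LLVM API):

    #include <cstdio>

    enum T2AddrMode { Imm12, Imm8, SoReg };

    // Offset partition implemented by the two selectors above.
    static T2AddrMode classifyOffset(int Off) {
      if (Off >= 0 && Off < 0x1000) return Imm12; // 12 bits, unsigned
      if (Off >= -255 && Off < 0)   return Imm8;  // 8 bits, always negative
      return SoReg;                               // base +/- (shifted) register
    }

    int main() {
      std::printf("%d %d %d\n", classifyOffset(4095), classifyOffset(-16),
                  classifyOffset(-4096)); // 0 1 2
    }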
@@ -643,7 +729,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDValue Op, SDValue N,
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N)) {
int RHSC = (int)RHS->getZExtValue();
if (RHSC >= 0 && RHSC < 0x100) { // 8 bits.
- OffImm = (AM == ISD::PRE_INC)
+ OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
? CurDAG->getTargetConstant(RHSC, MVT::i32)
: CurDAG->getTargetConstant(-RHSC, MVT::i32);
return true;
@@ -658,7 +744,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDValue Op, SDValue N,
if (N.getOpcode() == ISD::ADD) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
int RHSC = (int)RHS->getZExtValue();
- if (((RHSC & 0x3) == 0) && (RHSC < 0 && RHSC > -0x400)) { // 8 bits.
+      if (((RHSC & 0x3) == 0) &&
+          ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { // 8 bits, scaled by 4.
Base = N.getOperand(0);
OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
return true;
@@ -681,20 +768,17 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDValue Op, SDValue N,
bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue Op, SDValue N,
SDValue &Base,
SDValue &OffReg, SDValue &ShImm) {
- // Base only.
- if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) {
- Base = N;
- if (N.getOpcode() == ISD::FrameIndex) {
- int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
- } else if (N.getOpcode() == ARMISD::Wrapper) {
- Base = N.getOperand(0);
- if (Base.getOpcode() == ISD::TargetConstantPool)
- return false; // We want to select t2LDRpci instead.
- }
- OffReg = CurDAG->getRegister(0, MVT::i32);
- ShImm = CurDAG->getTargetConstant(0, MVT::i32);
- return true;
+ // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12.
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8.
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned)
+ return false;
+ else if (RHSC < 0 && RHSC >= -255) // 8 bits
+ return false;
}
// Look for (R + R) or (R + (R << [1,2,3])).
@@ -708,8 +792,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue Op, SDValue N,
ShOpcVal = ARM_AM::getShiftOpcForNode(Base);
if (ShOpcVal == ARM_AM::lsl)
std::swap(Base, OffReg);
- }
-
+ }
+
if (ShOpcVal == ARM_AM::lsl) {
// Check to see if the RHS of the shift is a constant, if not, we can't fold
// it.
@@ -723,11 +807,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue Op, SDValue N,
} else {
ShOpcVal = ARM_AM::no_shift;
}
- } else if (SelectT2AddrModeImm12(Op, N, Base, ShImm) ||
- SelectT2AddrModeImm8 (Op, N, Base, ShImm))
- // Don't match if it's possible to match to one of the r +/- imm cases.
- return false;
-
+ }
+
ShImm = CurDAG->getTargetConstant(ShAmt, MVT::i32);
return true;
@@ -746,7 +827,7 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDValue Op) {
if (AM == ISD::UNINDEXED)
return NULL;
- MVT LoadedVT = LD->getMemoryVT();
+ EVT LoadedVT = LD->getMemoryVT();
SDValue Offset, AMOpc;
bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
unsigned Opcode = 0;
@@ -780,8 +861,8 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDValue Op) {
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getTargetNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32,
- MVT::Other, Ops, 6);
+ return CurDAG->getMachineNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32,
+ MVT::Other, Ops, 6);
}
return NULL;
@@ -793,14 +874,14 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDValue Op) {
if (AM == ISD::UNINDEXED)
return NULL;
- MVT LoadedVT = LD->getMemoryVT();
+ EVT LoadedVT = LD->getMemoryVT();
bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
SDValue Offset;
bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
unsigned Opcode = 0;
bool Match = false;
if (SelectT2AddrModeImm8Offset(Op, LD->getOffset(), Offset)) {
- switch (LoadedVT.getSimpleVT()) {
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
case MVT::i32:
Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST;
break;
@@ -828,13 +909,300 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDValue Op) {
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, Offset, getAL(CurDAG),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getTargetNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32,
- MVT::Other, Ops, 5);
+ return CurDAG->getMachineNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32,
+ MVT::Other, Ops, 5);
+ }
+
+ return NULL;
+}
+
+SDNode *ARMDAGToDAGISel::SelectDYN_ALLOC(SDValue Op) {
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = Op.getValueType();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ SDValue Align = Op.getOperand(2);
+ SDValue SP = CurDAG->getRegister(ARM::SP, MVT::i32);
+ int32_t AlignVal = cast<ConstantSDNode>(Align)->getSExtValue();
+ if (AlignVal < 0)
+    // We need to align the stack. Use the Thumb1 tAND, which is the only
+    // Thumb instruction that can read and write SP. This matches to a pseudo
+    // instruction that has a chain to ensure the result is written back to
+    // the stack pointer.
+ SP = SDValue(CurDAG->getMachineNode(ARM::tANDsp, dl, VT, SP, Align), 0);
+
+ bool isC = isa<ConstantSDNode>(Size);
+ uint32_t C = isC ? cast<ConstantSDNode>(Size)->getZExtValue() : ~0UL;
+ // Handle the most common case for both Thumb1 and Thumb2:
+ // tSUBspi - immediate is between 0 ... 508 inclusive.
+ if (C <= 508 && ((C & 3) == 0))
+    // FIXME: tSUBspi encodes scale 4 implicitly.
+ return CurDAG->SelectNodeTo(N, ARM::tSUBspi_, VT, MVT::Other, SP,
+ CurDAG->getTargetConstant(C/4, MVT::i32),
+ Chain);
+
+ if (Subtarget->isThumb1Only()) {
+    // Use tADDspr since Thumb1 does not have a sub r, sp, r. ARMISelLowering
+    // should have negated the size operand already. FIXME: We can't insert a
+    // new target-independent node at this stage, so we are forced to negate
+    // it earlier. Is there a better solution?
+ return CurDAG->SelectNodeTo(N, ARM::tADDspr_, VT, MVT::Other, SP, Size,
+ Chain);
+ } else if (Subtarget->isThumb2()) {
+ if (isC && Predicate_t2_so_imm(Size.getNode())) {
+ // t2SUBrSPi
+ SDValue Ops[] = { SP, CurDAG->getTargetConstant(C, MVT::i32), Chain };
+ return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPi_, VT, MVT::Other, Ops, 3);
+ } else if (isC && Predicate_imm0_4095(Size.getNode())) {
+ // t2SUBrSPi12
+ SDValue Ops[] = { SP, CurDAG->getTargetConstant(C, MVT::i32), Chain };
+ return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPi12_, VT, MVT::Other, Ops, 3);
+ } else {
+ // t2SUBrSPs
+ SDValue Ops[] = { SP, Size,
+ getI32Imm(ARM_AM::getSORegOpc(ARM_AM::lsl,0)), Chain };
+ return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPs_, VT, MVT::Other, Ops, 4);
+ }
+ }
+
+ // FIXME: Add ADD / SUB sp instructions for ARM.
+ return 0;
+}
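
The fast path above works because tSUBspi encodes the adjustment as size/4 in its immediate field, so only 4-byte-aligned sizes up to 508 (= 127 * 4) qualify. A standalone sketch of the test and the encoded value (fitsTSUBspi is a hypothetical helper):

    #include <cstdio>

    // tSUBspi fast-path test from SelectDYN_ALLOC above: the instruction
    // stores C/4, so C must be a multiple of 4 and at most 508.
    static bool fitsTSUBspi(unsigned C, unsigned &Encoded) {
      if (C <= 508 && (C & 3) == 0) {
        Encoded = C / 4;
        return true;
      }
      return false;
    }

    int main() {
      unsigned Enc = 0;
      std::printf("%d\n", fitsTSUBspi(508, Enc)); // 1 (Enc = 127)
      std::printf("%d\n", fitsTSUBspi(510, Enc)); // 0 (not a multiple of 4)
    }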
+
+/// PairDRegs - Insert a pair of double registers into an implicit def to
+/// form a quad register.
+SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) {
+ DebugLoc dl = V0.getNode()->getDebugLoc();
+ SDValue Undef =
+ SDValue(CurDAG->getMachineNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT), 0);
+ SDValue SubReg0 = CurDAG->getTargetConstant(ARM::DSUBREG_0, MVT::i32);
+ SDValue SubReg1 = CurDAG->getTargetConstant(ARM::DSUBREG_1, MVT::i32);
+ SDNode *Pair = CurDAG->getMachineNode(TargetInstrInfo::INSERT_SUBREG, dl,
+ VT, Undef, V0, SubReg0);
+ return CurDAG->getMachineNode(TargetInstrInfo::INSERT_SUBREG, dl,
+ VT, SDValue(Pair, 0), V1, SubReg1);
+}
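
PairDRegs builds a Q-register value without reading its previous contents: start from IMPLICIT_DEF, then insert the two D halves as subregisters. A toy model of the data movement only (a Q register viewed as two 64-bit lanes; this is an illustration, not how the register allocator sees it):

    #include <cstdint>
    #include <cstdio>

    // Toy 128-bit Q register as two 64-bit D subregisters.
    struct QReg { uint64_t D[2]; };

    // Mirrors PairDRegs: undef container, then insert both D subregs.
    static QReg pairDRegs(uint64_t V0, uint64_t V1) {
      QReg Q = {};  // stands in for IMPLICIT_DEF; contents don't matter
      Q.D[0] = V0;  // INSERT_SUBREG ..., DSUBREG_0
      Q.D[1] = V1;  // INSERT_SUBREG ..., DSUBREG_1
      return Q;
    }

    int main() {
      QReg Q = pairDRegs(0x1111111111111111ULL, 0x2222222222222222ULL);
      std::printf("%016llx %016llx\n", (unsigned long long)Q.D[0],
                  (unsigned long long)Q.D[1]);
    }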
+
+/// GetNEONSubregVT - Given a type for a 128-bit NEON vector, return the type
+/// for a 64-bit subregister of the vector.
+static EVT GetNEONSubregVT(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled NEON type");
+ case MVT::v16i8: return MVT::v8i8;
+ case MVT::v8i16: return MVT::v4i16;
+ case MVT::v4f32: return MVT::v2f32;
+ case MVT::v4i32: return MVT::v2i32;
+ case MVT::v2i64: return MVT::v1i64;
+ }
+}
+
+SDNode *ARMDAGToDAGISel::SelectVLD(SDValue Op, unsigned NumVecs,
+ unsigned *DOpcodes, unsigned *QOpcodes0,
+ unsigned *QOpcodes1) {
+  assert(NumVecs >= 2 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+
+ SDValue MemAddr, MemUpdate, MemOpc;
+ if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
+ return NULL;
+
+ SDValue Chain = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ bool is64BitVector = VT.is64BitVector();
+
+ unsigned OpcodeIndex;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vld type");
+ // Double-register operations:
+ case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4i16: OpcodeIndex = 1; break;
+ case MVT::v2f32:
+ case MVT::v2i32: OpcodeIndex = 2; break;
+ case MVT::v1i64: OpcodeIndex = 3; break;
+ // Quad-register operations:
+ case MVT::v16i8: OpcodeIndex = 0; break;
+ case MVT::v8i16: OpcodeIndex = 1; break;
+ case MVT::v4f32:
+ case MVT::v4i32: OpcodeIndex = 2; break;
+ }
+
+ if (is64BitVector) {
+ unsigned Opc = DOpcodes[OpcodeIndex];
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, Chain };
+ std::vector<EVT> ResTys(NumVecs, VT);
+ ResTys.push_back(MVT::Other);
+ return CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 4);
+ }
+
+ EVT RegVT = GetNEONSubregVT(VT);
+ if (NumVecs == 2) {
+ // Quad registers are directly supported for VLD2,
+ // loading 2 pairs of D regs.
+ unsigned Opc = QOpcodes0[OpcodeIndex];
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, Chain };
+ std::vector<EVT> ResTys(4, VT);
+ ResTys.push_back(MVT::Other);
+ SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 4);
+ Chain = SDValue(VLd, 4);
+
+ // Combine the even and odd subregs to produce the result.
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+ SDNode *Q = PairDRegs(VT, SDValue(VLd, 2*Vec), SDValue(VLd, 2*Vec+1));
+ ReplaceUses(SDValue(N, Vec), SDValue(Q, 0));
+ }
+ } else {
+ // Otherwise, quad registers are loaded with two separate instructions,
+ // where one loads the even registers and the other loads the odd registers.
+
+ // Enable writeback to the address register.
+ MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32);
+
+ std::vector<EVT> ResTys(NumVecs, RegVT);
+ ResTys.push_back(MemAddr.getValueType());
+ ResTys.push_back(MVT::Other);
+
+ // Load the even subreg.
+ unsigned Opc = QOpcodes0[OpcodeIndex];
+ const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc, Chain };
+ SDNode *VLdA = CurDAG->getMachineNode(Opc, dl, ResTys, OpsA, 4);
+ Chain = SDValue(VLdA, NumVecs+1);
+
+ // Load the odd subreg.
+ Opc = QOpcodes1[OpcodeIndex];
+ const SDValue OpsB[] = { SDValue(VLdA, NumVecs), MemUpdate, MemOpc, Chain };
+ SDNode *VLdB = CurDAG->getMachineNode(Opc, dl, ResTys, OpsB, 4);
+ Chain = SDValue(VLdB, NumVecs+1);
+
+ // Combine the even and odd subregs to produce the result.
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+ SDNode *Q = PairDRegs(VT, SDValue(VLdA, Vec), SDValue(VLdB, Vec));
+ ReplaceUses(SDValue(N, Vec), SDValue(Q, 0));
+ }
+ }
+ ReplaceUses(SDValue(N, NumVecs), Chain);
+ return NULL;
+}
+
+SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDValue Op, bool IsLoad,
+ unsigned NumVecs, unsigned *DOpcodes,
+ unsigned *QOpcodes0,
+ unsigned *QOpcodes1) {
+  assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+
+ SDValue MemAddr, MemUpdate, MemOpc;
+ if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
+ return NULL;
+
+ SDValue Chain = N->getOperand(0);
+ unsigned Lane =
+ cast<ConstantSDNode>(N->getOperand(NumVecs+3))->getZExtValue();
+ EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType();
+ bool is64BitVector = VT.is64BitVector();
+
+ // Quad registers are handled by load/store of subregs. Find the subreg info.
+ unsigned NumElts = 0;
+ int SubregIdx = 0;
+ EVT RegVT = VT;
+ if (!is64BitVector) {
+ RegVT = GetNEONSubregVT(VT);
+ NumElts = RegVT.getVectorNumElements();
+ SubregIdx = (Lane < NumElts) ? ARM::DSUBREG_0 : ARM::DSUBREG_1;
+ }
+
+ unsigned OpcodeIndex;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vld/vst lane type");
+ // Double-register operations:
+ case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4i16: OpcodeIndex = 1; break;
+ case MVT::v2f32:
+ case MVT::v2i32: OpcodeIndex = 2; break;
+ // Quad-register operations:
+ case MVT::v8i16: OpcodeIndex = 0; break;
+ case MVT::v4f32:
+ case MVT::v4i32: OpcodeIndex = 1; break;
+ }
+
+ SmallVector<SDValue, 9> Ops;
+ Ops.push_back(MemAddr);
+ Ops.push_back(MemUpdate);
+ Ops.push_back(MemOpc);
+
+ unsigned Opc = 0;
+ if (is64BitVector) {
+ Opc = DOpcodes[OpcodeIndex];
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ Ops.push_back(N->getOperand(Vec+3));
+ } else {
+ // Check if this is loading the even or odd subreg of a Q register.
+ if (Lane < NumElts) {
+ Opc = QOpcodes0[OpcodeIndex];
+ } else {
+ Lane -= NumElts;
+ Opc = QOpcodes1[OpcodeIndex];
+ }
+ // Extract the subregs of the input vector.
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ Ops.push_back(CurDAG->getTargetExtractSubreg(SubregIdx, dl, RegVT,
+ N->getOperand(Vec+3)));
+ }
+ Ops.push_back(getI32Imm(Lane));
+ Ops.push_back(Chain);
+
+ if (!IsLoad)
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+5);
+
+ std::vector<EVT> ResTys(NumVecs, RegVT);
+ ResTys.push_back(MVT::Other);
+ SDNode *VLdLn =
+ CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), NumVecs+5);
+ // For a 64-bit vector load to D registers, nothing more needs to be done.
+ if (is64BitVector)
+ return VLdLn;
+
+ // For 128-bit vectors, take the 64-bit results of the load and insert them
+ // as subregs into the result.
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+ SDValue QuadVec = CurDAG->getTargetInsertSubreg(SubregIdx, dl, VT,
+ N->getOperand(Vec+3),
+ SDValue(VLdLn, Vec));
+ ReplaceUses(SDValue(N, Vec), QuadVec);
}
+ Chain = SDValue(VLdLn, NumVecs);
+ ReplaceUses(SDValue(N, NumVecs), Chain);
return NULL;
}
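
The lane bookkeeping above splits a 128-bit vector's lane index in two: lanes below NumElts (the element count of the 64-bit half) live in the even D subreg, the rest in the odd one with the lane rebased. A small sketch of that mapping (mapLane is an illustrative name):

    #include <cstdio>

    // Lane-to-subreg mapping from SelectVLDSTLane above. NumElts is the
    // element count of the 64-bit half, e.g. 4 for v8i16 -> v4i16.
    static void mapLane(unsigned Lane, unsigned NumElts,
                        bool &OddSubreg, unsigned &SubLane) {
      OddSubreg = Lane >= NumElts;
      SubLane = OddSubreg ? Lane - NumElts : Lane;
    }

    int main() {
      bool Odd; unsigned L;
      mapLane(6, 4, Odd, L); // v8i16 lane 6 -> odd D subreg, lane 2
      std::printf("odd=%d lane=%u\n", Odd, L);
    }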
+SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDValue Op,
+ unsigned Opc) {
+ if (!Subtarget->hasV6T2Ops())
+ return NULL;
+
+ unsigned Shl_imm = 0;
+  if (isOpcWithIntImmediate(Op.getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+ assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!");
+ unsigned Srl_imm = 0;
+ if (isInt32Immediate(Op.getOperand(1), Srl_imm)) {
+ assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+ unsigned Width = 32 - Srl_imm;
+ int LSB = Srl_imm - Shl_imm;
+ if ((LSB + Width) > 32)
+ return NULL;
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { Op.getOperand(0).getOperand(0),
+ CurDAG->getTargetConstant(LSB, MVT::i32),
+ CurDAG->getTargetConstant(Width, MVT::i32),
+ getAL(CurDAG), Reg0 };
+ return CurDAG->SelectNodeTo(Op.getNode(), Opc, MVT::i32, Ops, 5);
+ }
+ }
+ return NULL;
+}
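
The arithmetic above follows from the matched pattern (srl (shl x, ShlImm), SrlImm): the two shifts isolate Width = 32 - SrlImm bits starting at LSB = SrlImm - ShlImm, which is exactly what the bitfield-extract instruction computes. A quick standalone check of the equivalence (for the unsigned/UBFX case):

    #include <cstdint>
    #include <cstdio>

    static uint32_t viaShifts(uint32_t X, unsigned Shl, unsigned Srl) {
      return (X << Shl) >> Srl; // the pattern being replaced
    }
    static uint32_t viaUBFX(uint32_t X, unsigned LSB, unsigned Width) {
      return (X >> LSB) & ((1u << Width) - 1); // what UBFX computes (Width < 32)
    }

    int main() {
      uint32_t X = 0xDEADBEEF;
      unsigned Shl = 8, Srl = 24; // extracts bits [23:16]
      std::printf("%02x %02x\n", viaShifts(X, Shl, Srl),
                  viaUBFX(X, Srl - Shl, 32 - Srl)); // ad ad
    }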
SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
SDNode *N = Op.getNode();
@@ -848,44 +1216,50 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
bool UseCP = true;
- if (Subtarget->isThumb()) {
- if (Subtarget->hasThumb2())
- // Thumb2 has the MOVT instruction, so all immediates can
- // be done with MOV + MOVT, at worst.
- UseCP = 0;
- else
+ if (Subtarget->hasThumb2())
+ // Thumb2-aware targets have the MOVT instruction, so all immediates can
+ // be done with MOV + MOVT, at worst.
+ UseCP = 0;
+ else {
+ if (Subtarget->isThumb()) {
UseCP = (Val > 255 && // MOV
~Val > 255 && // MOV + MVN
!ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL
- } else
- UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
- ARM_AM::getSOImmVal(~Val) == -1 && // MVN
- !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs.
+ } else
+ UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
+ ARM_AM::getSOImmVal(~Val) == -1 && // MVN
+ !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs.
+ }
+
if (UseCP) {
SDValue CPIdx =
- CurDAG->getTargetConstantPool(ConstantInt::get(Type::Int32Ty, Val),
+ CurDAG->getTargetConstantPool(ConstantInt::get(
+ Type::getInt32Ty(*CurDAG->getContext()), Val),
TLI.getPointerTy());
SDNode *ResNode;
- if (Subtarget->isThumb())
- ResNode = CurDAG->getTargetNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other,
- CPIdx, CurDAG->getEntryNode());
- else {
+ if (Subtarget->isThumb1Only()) {
+ SDValue Pred = CurDAG->getTargetConstant(0xEULL, MVT::i32);
+ SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
+ ResNode = CurDAG->getMachineNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other,
+ Ops, 4);
+ } else {
SDValue Ops[] = {
- CPIdx,
+ CPIdx,
CurDAG->getRegister(0, MVT::i32),
CurDAG->getTargetConstant(0, MVT::i32),
getAL(CurDAG),
CurDAG->getRegister(0, MVT::i32),
CurDAG->getEntryNode()
};
- ResNode=CurDAG->getTargetNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
- Ops, 6);
+ ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+ Ops, 6);
}
ReplaceUses(Op, SDValue(ResNode, 0));
return NULL;
}
-
+
// Other cases are autogenerated.
break;
}
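
The UseCP decision above for ARM mode rests on the so_imm encoding: a data-processing immediate is an 8-bit value rotated right by an even amount, and a value goes to the constant pool only when neither it, its complement, nor a two-instruction split fits. A standalone sketch of the basic rotated-immediate test (isSOImm is an illustrative stand-in for ARM_AM::getSOImmVal):

    #include <cstdint>
    #include <cstdio>

    // Is V an 8-bit value rotated right by an even amount (0..30)?
    static bool isSOImm(uint32_t V) {
      for (unsigned Rot = 0; Rot < 32; Rot += 2) {
        // Rotate left by Rot to undo a rotate-right of Rot.
        uint32_t U = Rot ? ((V << Rot) | (V >> (32 - Rot))) : V;
        if ((U & ~0xFFu) == 0)
          return true;
      }
      return false;
    }

    int main() {
      std::printf("%d %d %d\n", isSOImm(255), isSOImm(0xFF00),
                  isSOImm(0xFF001)); // 1 1 0
    }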
@@ -893,80 +1267,106 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
// Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
int FI = cast<FrameIndexSDNode>(N)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb1Only()) {
return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, TFI,
CurDAG->getTargetConstant(0, MVT::i32));
} else {
+ unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
+ ARM::t2ADDri : ARM::ADDri);
SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
- getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->SelectNodeTo(N, ARM::ADDri, MVT::i32, Ops, 5);
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
}
}
- case ISD::ADD: {
- if (!Subtarget->isThumb())
- break;
- // Select add sp, c to tADDhirr.
- SDValue N0 = Op.getOperand(0);
- SDValue N1 = Op.getOperand(1);
- RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(Op.getOperand(0));
- RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(Op.getOperand(1));
- if (LHSR && LHSR->getReg() == ARM::SP) {
- std::swap(N0, N1);
- std::swap(LHSR, RHSR);
- }
- if (RHSR && RHSR->getReg() == ARM::SP) {
- SDValue Val = SDValue(CurDAG->getTargetNode(ARM::tMOVlor2hir, dl,
- Op.getValueType(), N0, N0), 0);
- return CurDAG->SelectNodeTo(N, ARM::tADDhirr, Op.getValueType(), Val, N1);
- }
+ case ARMISD::DYN_ALLOC:
+ return SelectDYN_ALLOC(Op);
+ case ISD::SRL:
+ if (SDNode *I = SelectV6T2BitfieldExtractOp(Op,
+ Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX))
+ return I;
+ break;
+ case ISD::SRA:
+ if (SDNode *I = SelectV6T2BitfieldExtractOp(Op,
+ Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX))
+ return I;
break;
- }
case ISD::MUL:
- if (Subtarget->isThumb())
+ if (Subtarget->isThumb1Only())
break;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
unsigned RHSV = C->getZExtValue();
if (!RHSV) break;
if (isPowerOf2_32(RHSV-1)) { // 2^n+1?
+ unsigned ShImm = Log2_32(RHSV-1);
+ if (ShImm >= 32)
+ break;
SDValue V = Op.getOperand(0);
- unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV-1));
- SDValue Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32),
- CurDAG->getTargetConstant(ShImm, MVT::i32),
- getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7);
+ ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+ SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6);
+ } else {
+ SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7);
+ }
}
if (isPowerOf2_32(RHSV+1)) { // 2^n-1?
+ unsigned ShImm = Log2_32(RHSV+1);
+ if (ShImm >= 32)
+ break;
SDValue V = Op.getOperand(0);
- unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV+1));
- SDValue Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32),
- CurDAG->getTargetConstant(ShImm, MVT::i32),
- getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7);
+ ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+ SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 5);
+ } else {
+ SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7);
+ }
}
}
break;
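
The two cases above are the classic multiply strength reductions: for c = 2^n + 1, x*c = x + (x << n), selected as an add with a shifted operand; for c = 2^n - 1, x*c = (x << n) - x, selected as a reverse subtract (RSB) with a shifted operand. A quick standalone check:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t X = 12345;
      unsigned N = 3;
      uint32_t AddForm = X + (X << N); // x * (2^n + 1) -> ADDrs / t2ADDrs
      uint32_t RsbForm = (X << N) - X; // x * (2^n - 1) -> RSBrs / t2RSBrs
      std::printf("%d %d\n", AddForm == X * 9u, RsbForm == X * 7u); // 1 1
    }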
case ARMISD::FMRRD:
- return CurDAG->getTargetNode(ARM::FMRRD, dl, MVT::i32, MVT::i32,
- Op.getOperand(0), getAL(CurDAG),
- CurDAG->getRegister(0, MVT::i32));
+ return CurDAG->getMachineNode(ARM::FMRRD, dl, MVT::i32, MVT::i32,
+ Op.getOperand(0), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32));
case ISD::UMUL_LOHI: {
- SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ if (Subtarget->isThumb1Only())
+ break;
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops,4);
+ } else {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getTargetNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+ return CurDAG->getMachineNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+ }
}
case ISD::SMUL_LOHI: {
- SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ if (Subtarget->isThumb1Only())
+ break;
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops,4);
+ } else {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getTargetNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+ return CurDAG->getMachineNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+ }
}
case ISD::LOAD: {
SDNode *ResNode = 0;
- if (Subtarget->isThumb2())
+ if (Subtarget->isThumb() && Subtarget->hasThumb2())
ResNode = SelectT2IndexedLoad(Op);
else
ResNode = SelectARMIndexedLoad(Op);
@@ -988,7 +1388,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
// Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc)
// Pattern complexity = 6 cost = 1 size = 0
- unsigned Opc = Subtarget->isThumb() ?
+ unsigned Opc = Subtarget->isThumb() ?
((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
SDValue Chain = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
@@ -1003,8 +1403,8 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
cast<ConstantSDNode>(N2)->getZExtValue()),
MVT::i32);
SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag };
- SDNode *ResNode = CurDAG->getTargetNode(Opc, dl, MVT::Other,
- MVT::Flag, Ops, 5);
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+ MVT::Flag, Ops, 5);
Chain = SDValue(ResNode, 0);
if (Op.getNode()->getNumValues() == 2) {
InFlag = SDValue(ResNode, 1);
@@ -1014,8 +1414,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
return NULL;
}
case ARMISD::CMOV: {
- bool isThumb = Subtarget->isThumb();
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
@@ -1024,39 +1423,79 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
assert(N2.getOpcode() == ISD::Constant);
assert(N3.getOpcode() == ISD::Register);
- // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
- // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
- // Pattern complexity = 18 cost = 1 size = 0
- SDValue CPTmp0;
- SDValue CPTmp1;
- SDValue CPTmp2;
- if (!isThumb && VT == MVT::i32 &&
- SelectShifterOperandReg(Op, N1, CPTmp0, CPTmp1, CPTmp2)) {
- SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
- cast<ConstantSDNode>(N2)->getZExtValue()),
- MVT::i32);
- SDValue Ops[] = { N0, CPTmp0, CPTmp1, CPTmp2, Tmp2, N3, InFlag };
- return CurDAG->SelectNodeTo(Op.getNode(), ARM::MOVCCs, MVT::i32, Ops, 7);
- }
+ if (!Subtarget->isThumb1Only() && VT == MVT::i32) {
+ // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+ // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+ // Pattern complexity = 18 cost = 1 size = 0
+ SDValue CPTmp0;
+ SDValue CPTmp1;
+ SDValue CPTmp2;
+ if (Subtarget->isThumb()) {
+ if (SelectT2ShifterOperandReg(Op, N1, CPTmp0, CPTmp1)) {
+ unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue();
+ unsigned SOShOp = ARM_AM::getSORegShOp(SOVal);
+ unsigned Opc = 0;
+ switch (SOShOp) {
+ case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break;
+ case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break;
+ case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break;
+ case ARM_AM::ror: Opc = ARM::t2MOVCCror; break;
+ default:
+ llvm_unreachable("Unknown so_reg opcode!");
+ break;
+ }
+ SDValue SOShImm =
+ CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32);
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, CPTmp0, SOShImm, Tmp2, N3, InFlag };
+ return CurDAG->SelectNodeTo(Op.getNode(), Opc, MVT::i32,Ops, 6);
+ }
+ } else {
+ if (SelectShifterOperandReg(Op, N1, CPTmp0, CPTmp1, CPTmp2)) {
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, CPTmp0, CPTmp1, CPTmp2, Tmp2, N3, InFlag };
+ return CurDAG->SelectNodeTo(Op.getNode(),
+ ARM::MOVCCs, MVT::i32, Ops, 7);
+ }
+ }
- // Pattern: (ARMcmov:i32 GPR:i32:$false,
- // (imm:i32)<<P:Predicate_so_imm>><<X:so_imm_XFORM>>:$true,
- // (imm:i32):$cc)
- // Emits: (MOVCCi:i32 GPR:i32:$false,
- // (so_imm_XFORM:i32 (imm:i32):$true), (imm:i32):$cc)
- // Pattern complexity = 10 cost = 1 size = 0
- if (VT == MVT::i32 &&
- N3.getOpcode() == ISD::Constant &&
- Predicate_so_imm(N3.getNode())) {
- SDValue Tmp1 = CurDAG->getTargetConstant(((unsigned)
- cast<ConstantSDNode>(N1)->getZExtValue()),
- MVT::i32);
- Tmp1 = Transform_so_imm_XFORM(Tmp1.getNode());
- SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
- cast<ConstantSDNode>(N2)->getZExtValue()),
- MVT::i32);
- SDValue Ops[] = { N0, Tmp1, Tmp2, N3, InFlag };
- return CurDAG->SelectNodeTo(Op.getNode(), ARM::MOVCCi, MVT::i32, Ops, 5);
+ // Pattern: (ARMcmov:i32 GPR:i32:$false,
+ // (imm:i32)<<P:Predicate_so_imm>>:$true,
+ // (imm:i32):$cc)
+ // Emits: (MOVCCi:i32 GPR:i32:$false,
+ // (so_imm:i32 (imm:i32):$true), (imm:i32):$cc)
+ // Pattern complexity = 10 cost = 1 size = 0
+ if (N3.getOpcode() == ISD::Constant) {
+ if (Subtarget->isThumb()) {
+ if (Predicate_t2_so_imm(N3.getNode())) {
+ SDValue Tmp1 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N1)->getZExtValue()),
+ MVT::i32);
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, Tmp1, Tmp2, N3, InFlag };
+ return CurDAG->SelectNodeTo(Op.getNode(),
+ ARM::t2MOVCCi, MVT::i32, Ops, 5);
+ }
+ } else {
+ if (Predicate_so_imm(N3.getNode())) {
+ SDValue Tmp1 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N1)->getZExtValue()),
+ MVT::i32);
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, Tmp1, Tmp2, N3, InFlag };
+ return CurDAG->SelectNodeTo(Op.getNode(),
+ ARM::MOVCCi, MVT::i32, Ops, 5);
+ }
+ }
+ }
}
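// A sketch of the encodability rule Predicate_so_imm tests: an ARM so_imm is
// an 8-bit value rotated right by an even amount. The helper name below is
// illustrative only, not LLVM API:
//   static bool isSOImmEncodable(uint32_t V) {
//     for (unsigned R = 0; R < 32; R += 2) {
//       uint32_t Rot = (V << R) | (R ? (V >> (32 - R)) : 0); // rotl(V, R)
//       if (Rot < 256) return true;   // fits in 8 bits after rotation
//     }
//     return false;
//   }
// e.g. 0xFF000000 is encodable (0xFF ror 8), while 0x101 is not.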
// Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
@@ -1073,23 +1512,25 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
MVT::i32);
SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag };
unsigned Opc = 0;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: assert(false && "Illegal conditional move type!");
break;
case MVT::i32:
- Opc = isThumb ? ARM::tMOVCCr : ARM::MOVCCr;
+ Opc = Subtarget->isThumb()
+ ? (Subtarget->hasThumb2() ? ARM::t2MOVCCr : ARM::tMOVCCr_pseudo)
+ : ARM::MOVCCr;
break;
case MVT::f32:
Opc = ARM::FCPYScc;
break;
case MVT::f64:
Opc = ARM::FCPYDcc;
- break;
+ break;
}
return CurDAG->SelectNodeTo(Op.getNode(), Opc, VT, Ops, 5);
}
case ARMISD::CNEG: {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
@@ -1103,7 +1544,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
MVT::i32);
SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag };
unsigned Opc = 0;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: assert(false && "Illegal conditional move type!");
break;
case MVT::f32:
@@ -1116,104 +1557,308 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
return CurDAG->SelectNodeTo(Op.getNode(), Opc, VT, Ops, 5);
}
- case ISD::DECLARE: {
- SDValue Chain = Op.getOperand(0);
- SDValue N1 = Op.getOperand(1);
- SDValue N2 = Op.getOperand(2);
- FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N1);
- // FIXME: handle VLAs.
- if (!FINode) {
- ReplaceUses(Op.getValue(0), Chain);
- return NULL;
+ case ARMISD::VZIP: {
+ unsigned Opc = 0;
+ EVT VT = N->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return NULL;
+ case MVT::v8i8: Opc = ARM::VZIPd8; break;
+ case MVT::v4i16: Opc = ARM::VZIPd16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VZIPd32; break;
+ case MVT::v16i8: Opc = ARM::VZIPq8; break;
+ case MVT::v8i16: Opc = ARM::VZIPq16; break;
+ case MVT::v4f32:
+ case MVT::v4i32: Opc = ARM::VZIPq32; break;
}
- if (N2.getOpcode() == ARMISD::PIC_ADD && isa<LoadSDNode>(N2.getOperand(0)))
- N2 = N2.getOperand(0);
- LoadSDNode *Ld = dyn_cast<LoadSDNode>(N2);
- if (!Ld) {
- ReplaceUses(Op.getValue(0), Chain);
- return NULL;
+ return CurDAG->getMachineNode(Opc, dl, VT, VT,
+ N->getOperand(0), N->getOperand(1));
+ }
+ case ARMISD::VUZP: {
+ unsigned Opc = 0;
+ EVT VT = N->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return NULL;
+ case MVT::v8i8: Opc = ARM::VUZPd8; break;
+ case MVT::v4i16: Opc = ARM::VUZPd16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VUZPd32; break;
+ case MVT::v16i8: Opc = ARM::VUZPq8; break;
+ case MVT::v8i16: Opc = ARM::VUZPq16; break;
+ case MVT::v4f32:
+ case MVT::v4i32: Opc = ARM::VUZPq32; break;
}
- SDValue BasePtr = Ld->getBasePtr();
- assert(BasePtr.getOpcode() == ARMISD::Wrapper &&
- isa<ConstantPoolSDNode>(BasePtr.getOperand(0)) &&
- "llvm.dbg.variable should be a constantpool node");
- ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(BasePtr.getOperand(0));
- GlobalValue *GV = 0;
- if (CP->isMachineConstantPoolEntry()) {
- ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)CP->getMachineCPVal();
- GV = ACPV->getGV();
- } else
- GV = dyn_cast<GlobalValue>(CP->getConstVal());
- if (!GV) {
- ReplaceUses(Op.getValue(0), Chain);
- return NULL;
+ return CurDAG->getMachineNode(Opc, dl, VT, VT,
+ N->getOperand(0), N->getOperand(1));
+ }
+ case ARMISD::VTRN: {
+ unsigned Opc = 0;
+ EVT VT = N->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return NULL;
+ case MVT::v8i8: Opc = ARM::VTRNd8; break;
+ case MVT::v4i16: Opc = ARM::VTRNd16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VTRNd32; break;
+ case MVT::v16i8: Opc = ARM::VTRNq8; break;
+ case MVT::v8i16: Opc = ARM::VTRNq16; break;
+ case MVT::v4f32:
+ case MVT::v4i32: Opc = ARM::VTRNq32; break;
}
-
- SDValue Tmp1 = CurDAG->getTargetFrameIndex(FINode->getIndex(),
- TLI.getPointerTy());
- SDValue Tmp2 = CurDAG->getTargetGlobalAddress(GV, TLI.getPointerTy());
- SDValue Ops[] = { Tmp1, Tmp2, Chain };
- return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
- MVT::Other, Ops, 3);
+ return CurDAG->getMachineNode(Opc, dl, VT, VT,
+ N->getOperand(0), N->getOperand(1));
}
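// Lane-level view of the three permutes selected above, for 4 x i16
// d-registers a = {a0..a3}, b = {b0..b3} (standard NEON semantics; both
// operands are rewritten, hence the two identical result types):
//   VZIP: a' = {a0,b0,a1,b1}, b' = {a2,b2,a3,b3}   (interleave)
//   VUZP: a' = {a0,a2,b0,b2}, b' = {a1,a3,b1,b3}   (de-interleave)
//   VTRN: a' = {a0,b0,a2,b2}, b' = {a1,b1,a3,b3}   (2x2 transpose)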
- case ISD::CONCAT_VECTORS: {
- MVT VT = Op.getValueType();
- assert(VT.is128BitVector() && Op.getNumOperands() == 2 &&
- "unexpected CONCAT_VECTORS");
- SDValue N0 = Op.getOperand(0);
- SDValue N1 = Op.getOperand(1);
- SDNode *Result =
- CurDAG->getTargetNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT);
- if (N0.getOpcode() != ISD::UNDEF)
- Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
- SDValue(Result, 0), N0,
- CurDAG->getTargetConstant(arm_dsubreg_0,
- MVT::i32));
- if (N1.getOpcode() != ISD::UNDEF)
- Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
- SDValue(Result, 0), N1,
- CurDAG->getTargetConstant(arm_dsubreg_1,
- MVT::i32));
- return Result;
- }
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ EVT VT = N->getValueType(0);
+ unsigned Opc = 0;
+
+ switch (IntNo) {
+ default:
+ break;
+
+ case Intrinsic::arm_neon_vld2: {
+ unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
+ ARM::VLD2d32, ARM::VLD2d64 };
+ unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 };
+ return SelectVLD(Op, 2, DOpcodes, QOpcodes, 0);
+ }
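// The opcode tables are ordered by element size; SelectVLD (not shown in this
// hunk) picks the entry for the vector's element type. A hypothetical
// indexing sketch, consistent with the table layout:
//   unsigned Idx = Log2_32(VT.getVectorElementType().getSizeInBits()) - 3;
//   unsigned Opc = VT.is64BitVector() ? DOpcodes[Idx] : QOpcodes[Idx];
// so v4i16 -> DOpcodes[1] == ARM::VLD2d16 and v4i32 -> QOpcodes[2] == ARM::VLD2q32.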
- case ISD::VECTOR_SHUFFLE: {
- MVT VT = Op.getValueType();
-
- // Match 128-bit splat to VDUPLANEQ. (This could be done with a Pat in
- // ARMInstrNEON.td but it is awkward because the shuffle mask needs to be
- // transformed first into a lane number and then to both a subregister
- // index and an adjusted lane number.) If the source operand is a
- // SCALAR_TO_VECTOR, leave it so it will be matched later as a VDUP.
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- if (VT.is128BitVector() && SVOp->isSplat() &&
- Op.getOperand(0).getOpcode() != ISD::SCALAR_TO_VECTOR &&
- Op.getOperand(1).getOpcode() == ISD::UNDEF) {
- unsigned LaneVal = SVOp->getSplatIndex();
-
- MVT HalfVT;
- unsigned Opc = 0;
- switch (VT.getVectorElementType().getSimpleVT()) {
- default: assert(false && "unhandled VDUP splat type");
- case MVT::i8: Opc = ARM::VDUPLN8q; HalfVT = MVT::v8i8; break;
- case MVT::i16: Opc = ARM::VDUPLN16q; HalfVT = MVT::v4i16; break;
- case MVT::i32: Opc = ARM::VDUPLN32q; HalfVT = MVT::v2i32; break;
- case MVT::f32: Opc = ARM::VDUPLNfq; HalfVT = MVT::v2f32; break;
+ case Intrinsic::arm_neon_vld3: {
+ unsigned DOpcodes[] = { ARM::VLD3d8, ARM::VLD3d16,
+ ARM::VLD3d32, ARM::VLD3d64 };
+ unsigned QOpcodes0[] = { ARM::VLD3q8a, ARM::VLD3q16a, ARM::VLD3q32a };
+ unsigned QOpcodes1[] = { ARM::VLD3q8b, ARM::VLD3q16b, ARM::VLD3q32b };
+ return SelectVLD(Op, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vld4: {
+ unsigned DOpcodes[] = { ARM::VLD4d8, ARM::VLD4d16,
+ ARM::VLD4d32, ARM::VLD4d64 };
+ unsigned QOpcodes0[] = { ARM::VLD4q8a, ARM::VLD4q16a, ARM::VLD4q32a };
+ unsigned QOpcodes1[] = { ARM::VLD4q8b, ARM::VLD4q16b, ARM::VLD4q32b };
+ return SelectVLD(Op, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vld2lane: {
+ unsigned DOpcodes[] = { ARM::VLD2LNd8, ARM::VLD2LNd16, ARM::VLD2LNd32 };
+ unsigned QOpcodes0[] = { ARM::VLD2LNq16a, ARM::VLD2LNq32a };
+ unsigned QOpcodes1[] = { ARM::VLD2LNq16b, ARM::VLD2LNq32b };
+ return SelectVLDSTLane(Op, true, 2, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vld3lane: {
+ unsigned DOpcodes[] = { ARM::VLD3LNd8, ARM::VLD3LNd16, ARM::VLD3LNd32 };
+ unsigned QOpcodes0[] = { ARM::VLD3LNq16a, ARM::VLD3LNq32a };
+ unsigned QOpcodes1[] = { ARM::VLD3LNq16b, ARM::VLD3LNq32b };
+ return SelectVLDSTLane(Op, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vld4lane: {
+ unsigned DOpcodes[] = { ARM::VLD4LNd8, ARM::VLD4LNd16, ARM::VLD4LNd32 };
+ unsigned QOpcodes0[] = { ARM::VLD4LNq16a, ARM::VLD4LNq32a };
+ unsigned QOpcodes1[] = { ARM::VLD4LNq16b, ARM::VLD4LNq32b };
+ return SelectVLDSTLane(Op, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vst2: {
+ SDValue MemAddr, MemUpdate, MemOpc;
+ if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
+ return NULL;
+ SDValue Chain = N->getOperand(0);
+ VT = N->getOperand(3).getValueType();
+ if (VT.is64BitVector()) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst2 type");
+ case MVT::v8i8: Opc = ARM::VST2d8; break;
+ case MVT::v4i16: Opc = ARM::VST2d16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VST2d32; break;
+ case MVT::v1i64: Opc = ARM::VST2d64; break;
+ }
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
+ N->getOperand(3), N->getOperand(4), Chain };
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6);
+ }
+ // Quad registers are stored as pairs of double registers.
+ EVT RegVT;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst2 type");
+ case MVT::v16i8: Opc = ARM::VST2q8; RegVT = MVT::v8i8; break;
+ case MVT::v8i16: Opc = ARM::VST2q16; RegVT = MVT::v4i16; break;
+ case MVT::v4f32: Opc = ARM::VST2q32; RegVT = MVT::v2f32; break;
+ case MVT::v4i32: Opc = ARM::VST2q32; RegVT = MVT::v2i32; break;
+ }
+ SDValue D0 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(3));
+ SDValue D1 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(3));
+ SDValue D2 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(4));
+ SDValue D3 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(4));
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
+ D0, D1, D2, D3, Chain };
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 8);
+ }
+
+ case Intrinsic::arm_neon_vst3: {
+ SDValue MemAddr, MemUpdate, MemOpc;
+ if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
+ return NULL;
+ SDValue Chain = N->getOperand(0);
+ VT = N->getOperand(3).getValueType();
+ if (VT.is64BitVector()) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst3 type");
+ case MVT::v8i8: Opc = ARM::VST3d8; break;
+ case MVT::v4i16: Opc = ARM::VST3d16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VST3d32; break;
+ case MVT::v1i64: Opc = ARM::VST3d64; break;
+ }
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
+ N->getOperand(3), N->getOperand(4),
+ N->getOperand(5), Chain };
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7);
}
+ // Quad registers are stored with two separate instructions, where one
+ // stores the even registers and the other stores the odd registers.
+ EVT RegVT;
+ unsigned Opc2 = 0;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst3 type");
+ case MVT::v16i8:
+ Opc = ARM::VST3q8a; Opc2 = ARM::VST3q8b; RegVT = MVT::v8i8; break;
+ case MVT::v8i16:
+ Opc = ARM::VST3q16a; Opc2 = ARM::VST3q16b; RegVT = MVT::v4i16; break;
+ case MVT::v4f32:
+ Opc = ARM::VST3q32a; Opc2 = ARM::VST3q32b; RegVT = MVT::v2f32; break;
+ case MVT::v4i32:
+ Opc = ARM::VST3q32a; Opc2 = ARM::VST3q32b; RegVT = MVT::v2i32; break;
+ }
+ // Enable writeback to the address register.
+ MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32);
+
+ SDValue D0 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(3));
+ SDValue D2 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(4));
+ SDValue D4 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(5));
+ const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc, D0, D2, D4, Chain };
+ SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
+ MVT::Other, OpsA, 7);
+ Chain = SDValue(VStA, 1);
+
+ SDValue D1 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(3));
+ SDValue D3 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(4));
+ SDValue D5 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(5));
+ MemAddr = SDValue(VStA, 0);
+ const SDValue OpsB[] = { MemAddr, MemUpdate, MemOpc, D1, D3, D5, Chain };
+ SDNode *VStB = CurDAG->getMachineNode(Opc2, dl, MemAddr.getValueType(),
+ MVT::Other, OpsB, 7);
+ Chain = SDValue(VStB, 1);
+ ReplaceUses(SDValue(N, 0), Chain);
+ return NULL;
+ }
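// Data flow of the even/odd split above: value 0 of the first store is the
// written-back address and value 1 is the memory chain, so the "b" half
// resumes exactly where the "a" half stopped. Schematically (for v4i32):
//   (addr', chain')   = VST3q32a(addr,  D0, D2, D4)   // even d-registers
//   (addr'', chain'') = VST3q32b(addr', D1, D3, D5)   // odd d-registers
// The vst4 case below uses the same scheme with four registers per half.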
- // The source operand needs to be changed to a subreg of the original
- // 128-bit operand, and the lane number needs to be adjusted accordingly.
- unsigned NumElts = VT.getVectorNumElements() / 2;
- unsigned SRVal = (LaneVal < NumElts ? arm_dsubreg_0 : arm_dsubreg_1);
- SDValue SR = CurDAG->getTargetConstant(SRVal, MVT::i32);
- SDValue NewLane = CurDAG->getTargetConstant(LaneVal % NumElts, MVT::i32);
- SDNode *SubReg = CurDAG->getTargetNode(TargetInstrInfo::EXTRACT_SUBREG,
- dl, HalfVT, N->getOperand(0), SR);
- return CurDAG->SelectNodeTo(N, Opc, VT, SDValue(SubReg, 0), NewLane);
+ case Intrinsic::arm_neon_vst4: {
+ SDValue MemAddr, MemUpdate, MemOpc;
+ if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
+ return NULL;
+ SDValue Chain = N->getOperand(0);
+ VT = N->getOperand(3).getValueType();
+ if (VT.is64BitVector()) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst4 type");
+ case MVT::v8i8: Opc = ARM::VST4d8; break;
+ case MVT::v4i16: Opc = ARM::VST4d16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VST4d32; break;
+ case MVT::v1i64: Opc = ARM::VST4d64; break;
+ }
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
+ N->getOperand(3), N->getOperand(4),
+ N->getOperand(5), N->getOperand(6), Chain };
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 8);
+ }
+ // Quad registers are stored with two separate instructions, where one
+ // stores the even registers and the other stores the odd registers.
+ EVT RegVT;
+ unsigned Opc2 = 0;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst4 type");
+ case MVT::v16i8:
+ Opc = ARM::VST4q8a; Opc2 = ARM::VST4q8b; RegVT = MVT::v8i8; break;
+ case MVT::v8i16:
+ Opc = ARM::VST4q16a; Opc2 = ARM::VST4q16b; RegVT = MVT::v4i16; break;
+ case MVT::v4f32:
+ Opc = ARM::VST4q32a; Opc2 = ARM::VST4q32b; RegVT = MVT::v2f32; break;
+ case MVT::v4i32:
+ Opc = ARM::VST4q32a; Opc2 = ARM::VST4q32b; RegVT = MVT::v2i32; break;
+ }
+ // Enable writeback to the address register.
+ MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32);
+
+ SDValue D0 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(3));
+ SDValue D2 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(4));
+ SDValue D4 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(5));
+ SDValue D6 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(6));
+ const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc,
+ D0, D2, D4, D6, Chain };
+ SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
+ MVT::Other, OpsA, 8);
+ Chain = SDValue(VStA, 1);
+
+ SDValue D1 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(3));
+ SDValue D3 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(4));
+ SDValue D5 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(5));
+ SDValue D7 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(6));
+ MemAddr = SDValue(VStA, 0);
+ const SDValue OpsB[] = { MemAddr, MemUpdate, MemOpc,
+ D1, D3, D5, D7, Chain };
+ SDNode *VStB = CurDAG->getMachineNode(Opc2, dl, MemAddr.getValueType(),
+ MVT::Other, OpsB, 8);
+ Chain = SDValue(VStB, 1);
+ ReplaceUses(SDValue(N, 0), Chain);
+ return NULL;
}
- break;
+ case Intrinsic::arm_neon_vst2lane: {
+ unsigned DOpcodes[] = { ARM::VST2LNd8, ARM::VST2LNd16, ARM::VST2LNd32 };
+ unsigned QOpcodes0[] = { ARM::VST2LNq16a, ARM::VST2LNq32a };
+ unsigned QOpcodes1[] = { ARM::VST2LNq16b, ARM::VST2LNq32b };
+ return SelectVLDSTLane(Op, false, 2, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vst3lane: {
+ unsigned DOpcodes[] = { ARM::VST3LNd8, ARM::VST3LNd16, ARM::VST3LNd32 };
+ unsigned QOpcodes0[] = { ARM::VST3LNq16a, ARM::VST3LNq32a };
+ unsigned QOpcodes1[] = { ARM::VST3LNq16b, ARM::VST3LNq32b };
+ return SelectVLDSTLane(Op, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vst4lane: {
+ unsigned DOpcodes[] = { ARM::VST4LNd8, ARM::VST4LNd16, ARM::VST4LNd32 };
+ unsigned QOpcodes0[] = { ARM::VST4LNq16a, ARM::VST4LNq32a };
+ unsigned QOpcodes1[] = { ARM::VST4LNq16b, ARM::VST4LNq32b };
+ return SelectVLDSTLane(Op, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+ }
}
}
@@ -1224,20 +1869,17 @@ bool ARMDAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
std::vector<SDValue> &OutOps) {
assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
-
- SDValue Base, Offset, Opc;
- if (!SelectAddrMode2(Op, Op, Base, Offset, Opc))
- return true;
-
- OutOps.push_back(Base);
- OutOps.push_back(Offset);
- OutOps.push_back(Opc);
+ // Require the address to be in a register. That is safe for all ARM
+ // variants and it is hard to do anything much smarter without knowing
+ // how the operand is used.
+ OutOps.push_back(Op);
return false;
}
/// createARMISelDag - This pass converts a legalized DAG into a
/// ARM-specific DAG, ready for instruction scheduling.
///
-FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM) {
- return new ARMDAGToDAGISel(TM);
+FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new ARMDAGToDAGISel(TM, OptLevel);
}
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 41c9ecc..426cecb 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -17,9 +17,11 @@
#include "ARMConstantPoolValue.h"
#include "ARMISelLowering.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
@@ -36,74 +38,101 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include <sstream>
using namespace llvm;
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
- MVT PromotedBitwiseVT) {
+void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
+ EVT PromotedBitwiseVT) {
if (VT != PromotedLdStVT) {
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
+ setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+ AddPromotedToType (ISD::LOAD, VT.getSimpleVT(),
+ PromotedLdStVT.getSimpleVT());
- setOperationAction(ISD::STORE, VT, Promote);
- AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
+ setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+ AddPromotedToType (ISD::STORE, VT.getSimpleVT(),
+ PromotedLdStVT.getSimpleVT());
}
- MVT ElemTy = VT.getVectorElementType();
+ EVT ElemTy = VT.getVectorElementType();
if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
- setOperationAction(ISD::VSETCC, VT, Custom);
+ setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom);
if (ElemTy == MVT::i8 || ElemTy == MVT::i16)
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+ if (ElemTy != MVT::i32) {
+ setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Expand);
+ }
+ setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand);
if (VT.isInteger()) {
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
}
// Promote all bit-wise operations.
if (VT.isInteger() && VT != PromotedBitwiseVT) {
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
- }
-}
-
-void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
+ setOperationAction(ISD::AND, VT.getSimpleVT(), Promote);
+ AddPromotedToType (ISD::AND, VT.getSimpleVT(),
+ PromotedBitwiseVT.getSimpleVT());
+ setOperationAction(ISD::OR, VT.getSimpleVT(), Promote);
+ AddPromotedToType (ISD::OR, VT.getSimpleVT(),
+ PromotedBitwiseVT.getSimpleVT());
+ setOperationAction(ISD::XOR, VT.getSimpleVT(), Promote);
+ AddPromotedToType (ISD::XOR, VT.getSimpleVT(),
+ PromotedBitwiseVT.getSimpleVT());
+ }
+
+ // Neon does not support vector divide/remainder operations.
+ setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FDIV, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+}
+
+void ARMTargetLowering::addDRTypeForNEON(EVT VT) {
addRegisterClass(VT, ARM::DPRRegisterClass);
addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}
-void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
+void ARMTargetLowering::addQRTypeForNEON(EVT VT) {
addRegisterClass(VT, ARM::QPRRegisterClass);
addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
+static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
+ if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
+ return new TargetLoweringObjectFileMachO();
+ return new ARMElfTargetObjectFile();
+}
+
ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
- : TargetLowering(TM), ARMPCLabelIndex(0) {
+ : TargetLowering(TM, createTLOF(TM)), ARMPCLabelIndex(0) {
Subtarget = &TM.getSubtarget<ARMSubtarget>();
if (Subtarget->isTargetDarwin()) {
@@ -188,11 +217,20 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setLibcallName(RTLIB::SRL_I128, 0);
setLibcallName(RTLIB::SRA_I128, 0);
- if (Subtarget->isThumb())
+ // Libcalls should use the AAPCS base standard ABI, even if hard float
+ // is in effect, as per the ARM RTABI specification, section 4.1.2.
+ if (Subtarget->isAAPCS_ABI()) {
+ for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
+ setLibcallCallingConv(static_cast<RTLIB::Libcall>(i),
+ CallingConv::ARM_AAPCS);
+ }
+ }
+
+ if (Subtarget->isThumb1Only())
addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
else
addRegisterClass(MVT::i32, ARM::GPRRegisterClass);
- if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
addRegisterClass(MVT::f32, ARM::SPRRegisterClass);
addRegisterClass(MVT::f64, ARM::DPRRegisterClass);
@@ -213,6 +251,39 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
+ // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
+    // neither Neon nor VFP supports any arithmetic operations on it.
+ setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+ setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+ setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+ setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
+ setOperationAction(ISD::FREM, MVT::v2f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
+ setOperationAction(ISD::VSETCC, MVT::v2f64, Expand);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
+ setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
+ setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
+ setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
+ setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
+
+ // Neon does not support some operations on v1i64 and v2i64 types.
+ setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+ setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+ setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
+ setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
+
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRL);
@@ -246,7 +317,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
}
// i64 operation support.
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb1Only()) {
setOperationAction(ISD::MUL, MVT::i64, Expand);
setOperationAction(ISD::MULHU, MVT::i32, Expand);
setOperationAction(ISD::MULHS, MVT::i32, Expand);
@@ -287,7 +358,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
- setOperationAction(ISD::RET, MVT::Other, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
@@ -300,7 +370,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::VAEND, MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
+ // FIXME: Shouldn't need this, since no register is used, but the legalizer
+  // doesn't yet know how to avoid that for SjLj.
+ setExceptionSelectorRegister(ARM::R0);
+ if (Subtarget->isThumb())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
if (!Subtarget->hasV6Ops() && !Subtarget->isThumb2()) {
@@ -309,7 +386,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
}
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb())
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only())
// Turn f64->i64 into FMRRD, i64 -> f64 to FMDRR iff target supports vfp2.
setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
@@ -339,7 +416,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f32, Expand);
- if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
}
@@ -347,7 +424,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FPOW, MVT::f32, Expand);
// int <-> fp are custom expanded into bit_convert + ARMISD ops.
- if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
@@ -361,26 +438,19 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setStackPointerRegisterToSaveRestore(ARM::SP);
setSchedulingPreference(SchedulingForRegPressure);
- setIfCvtBlockSizeLimit(Subtarget->isThumb() ? 0 : 10);
- setIfCvtDupBlockSizeLimit(Subtarget->isThumb() ? 0 : 2);
-
- if (!Subtarget->isThumb()) {
- // Use branch latency information to determine if-conversion limits.
- // FIXME: If-converter should use instruction latency of the branch being
- // eliminated to compute the threshold. For ARMv6, the branch "latency"
- // varies depending on whether it's dynamically or statically predicted
- // and on whether the destination is in the prefetch buffer.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const InstrItineraryData &InstrItins = Subtarget->getInstrItineraryData();
- unsigned Latency= InstrItins.getLatency(TII->get(ARM::Bcc).getSchedClass());
- if (Latency > 1) {
- setIfCvtBlockSizeLimit(Latency-1);
- if (Latency > 2)
- setIfCvtDupBlockSizeLimit(Latency-2);
- } else {
- setIfCvtBlockSizeLimit(10);
- setIfCvtDupBlockSizeLimit(2);
- }
+
+ // FIXME: If-converter should use instruction latency to determine
+ // profitability rather than relying on fixed limits.
+ if (Subtarget->getCPUString() == "generic") {
+ // Generic (and overly aggressive) if-conversion limits.
+ setIfCvtBlockSizeLimit(10);
+ setIfCvtDupBlockSizeLimit(2);
+ } else if (Subtarget->hasV6Ops()) {
+ setIfCvtBlockSizeLimit(2);
+ setIfCvtDupBlockSizeLimit(1);
+ } else {
+ setIfCvtBlockSizeLimit(3);
+ setIfCvtDupBlockSizeLimit(2);
}
  maxStoresPerMemcpy = 1; // temporary - rewrite interface to use type
@@ -401,6 +471,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::tCALL: return "ARMISD::tCALL";
case ARMISD::BRCOND: return "ARMISD::BRCOND";
case ARMISD::BR_JT: return "ARMISD::BR_JT";
+ case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
case ARMISD::CMP: return "ARMISD::CMP";
@@ -425,6 +496,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+ case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";
+
case ARMISD::VCEQ: return "ARMISD::VCEQ";
case ARMISD::VCGE: return "ARMISD::VCGE";
case ARMISD::VCGEU: return "ARMISD::VCGEU";
@@ -453,13 +526,21 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu";
case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
- case ARMISD::VDUPLANEQ: return "ARMISD::VDUPLANEQ";
+ case ARMISD::VDUP: return "ARMISD::VDUP";
+ case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE";
+ case ARMISD::VEXT: return "ARMISD::VEXT";
+ case ARMISD::VREV64: return "ARMISD::VREV64";
+ case ARMISD::VREV32: return "ARMISD::VREV32";
+ case ARMISD::VREV16: return "ARMISD::VREV16";
+ case ARMISD::VZIP: return "ARMISD::VZIP";
+ case ARMISD::VUZP: return "ARMISD::VUZP";
+ case ARMISD::VTRN: return "ARMISD::VTRN";
}
}
/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
- return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2;
+ return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 0 : 1;
}
//===----------------------------------------------------------------------===//
@@ -469,7 +550,7 @@ unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
switch (CC) {
- default: assert(0 && "Unknown condition code!");
+ default: llvm_unreachable("Unknown condition code!");
case ISD::SETNE: return ARMCC::NE;
case ISD::SETEQ: return ARMCC::EQ;
case ISD::SETGT: return ARMCC::GT;
@@ -483,15 +564,12 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
}
}
-/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. It
-/// returns true if the operands should be inverted to form the proper
-/// comparison.
-static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
+static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
ARMCC::CondCodes &CondCode2) {
- bool Invert = false;
CondCode2 = ARMCC::AL;
switch (CC) {
- default: assert(0 && "Unknown FP condition!");
+ default: llvm_unreachable("Unknown FP condition!");
case ISD::SETEQ:
case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
case ISD::SETGT:
@@ -499,7 +577,7 @@ static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
case ISD::SETGE:
case ISD::SETOGE: CondCode = ARMCC::GE; break;
case ISD::SETOLT: CondCode = ARMCC::MI; break;
- case ISD::SETOLE: CondCode = ARMCC::GT; Invert = true; break;
+ case ISD::SETOLE: CondCode = ARMCC::LS; break;
case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
case ISD::SETO: CondCode = ARMCC::VC; break;
case ISD::SETUO: CondCode = ARMCC::VS; break;
@@ -513,24 +591,16 @@ static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
case ISD::SETNE:
case ISD::SETUNE: CondCode = ARMCC::NE; break;
}
- return Invert;
}
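// With the inversion hack removed, each ordered comparison now maps directly
// to a single ARM condition where possible (e.g. SETOLE -> LS). Comparisons
// such as SETONE still set CondCode2 (MI followed by GT), and the caller is
// expected to emit a second predicated instruction for that case.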
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
-//
-// The lower operations present on calling convention works on this order:
-// LowerCALL (virt regs --> phys regs, virt regs --> stack)
-// LowerFORMAL_ARGUMENTS (phys --> virt regs, stack --> virt regs)
-// LowerRET (virt regs --> phys regs)
-// LowerCALL (phys regs --> virt regs)
-//
//===----------------------------------------------------------------------===//
#include "ARMGenCallingConv.inc"
// APCS f64 is in register pairs, possibly split to stack
-static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool f64AssignAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
CCState &State, bool CanFail) {
static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
@@ -560,7 +630,7 @@ static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
@@ -573,7 +643,7 @@ static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
}
// AAPCS f64 is in aligned register pairs
-static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool f64AssignAAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
CCState &State, bool CanFail) {
static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
@@ -603,7 +673,7 @@ static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
@@ -615,7 +685,7 @@ static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true; // we handled it
}
-static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool f64RetAssign(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo, CCState &State) {
static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
@@ -635,7 +705,7 @@ static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
@@ -646,7 +716,7 @@ static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true; // we handled it
}
-static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
@@ -656,49 +726,48 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
/// CCAssignFnForNode - Selects the correct CCAssignFn for the
/// given CallingConvention value.
-CCAssignFn *ARMTargetLowering::CCAssignFnForNode(unsigned CC,
- bool Return) const {
+CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
+ bool Return,
+ bool isVarArg) const {
switch (CC) {
default:
- assert(0 && "Unsupported calling convention");
+ llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::Fast:
- // Use target triple & subtarget features to do actual dispatch.
- if (Subtarget->isAAPCS_ABI()) {
- if (Subtarget->hasVFP2() &&
- FloatABIType == FloatABI::Hard)
- return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
- else
- return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
- } else
- return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+ // Use target triple & subtarget features to do actual dispatch.
+ if (Subtarget->isAAPCS_ABI()) {
+ if (Subtarget->hasVFP2() &&
+ FloatABIType == FloatABI::Hard && !isVarArg)
+ return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+ else
+ return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+ } else
+ return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
case CallingConv::ARM_AAPCS_VFP:
- return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+ return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
case CallingConv::ARM_AAPCS:
- return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+ return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
case CallingConv::ARM_APCS:
- return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+ return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
}
}
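// A typical call site, mirroring the AnalyzeCallOperands use added later in
// this patch:
//   SmallVector<CCValAssign, 16> ArgLocs;
//   CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
//                  *DAG.getContext());
//   CCInfo.AnalyzeCallOperands(Outs,
//                              CCAssignFnForNode(CallConv, /*Return*/ false,
//                                                isVarArg));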
-/// LowerCallResult - Lower the result values of an ISD::CALL into the
-/// appropriate copies out of appropriate physical registers. This assumes that
-/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
-/// being lowered. The returns a SDNode with the same number of values as the
-/// ISD::CALL.
-SDNode *ARMTargetLowering::
-LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
- unsigned CallingConv, SelectionDAG &DAG) {
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
- DebugLoc dl = TheCall->getDebugLoc();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- bool isVarArg = TheCall->isVarArg();
- CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
- CCInfo.AnalyzeCallResult(TheCall,
- CCAssignFnForNode(CallingConv, /* Return*/ true));
-
- SmallVector<SDValue, 8> ResultVals;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins,
+ CCAssignFnForNode(CallConv, /* Return*/ true,
+ isVarArg));
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -743,20 +812,17 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
}
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), Val);
break;
}
- ResultVals.push_back(Val);
+ InVals.push_back(Val);
}
- // Merge everything together with a MERGE_VALUES node.
- ResultVals.push_back(Chain);
- return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
- &ResultVals[0], ResultVals.size()).getNode();
+ return Chain;
}
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
@@ -776,11 +842,11 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
/// LowerMemOpCallTo - Store the argument to the stack.
SDValue
-ARMTargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
- const SDValue &StackPtr,
- const CCValAssign &VA, SDValue Chain,
- SDValue Arg, ISD::ArgFlagsTy Flags) {
- DebugLoc dl = TheCall->getDebugLoc();
+ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
+ SDValue StackPtr, SDValue Arg,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
@@ -791,14 +857,13 @@ ARMTargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
PseudoSourceValue::getStack(), LocMemOffset);
}
-void ARMTargetLowering::PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
+void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVector<SDValue, 8> &MemOpChains,
ISD::ArgFlagsTy Flags) {
- DebugLoc dl = TheCall->getDebugLoc();
SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Arg);
@@ -811,27 +876,31 @@ void ARMTargetLowering::PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
if (StackPtr.getNode() == 0)
StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
- MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, NextVA,
- Chain, fmrrd.getValue(1), Flags));
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1),
+ dl, DAG, NextVA,
+ Flags));
}
}
-/// LowerCALL - Lowering a ISD::CALL node into a callseq_start <-
+/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
-SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- MVT RetVT = TheCall->getRetValType(0);
- SDValue Chain = TheCall->getChain();
- unsigned CC = TheCall->getCallingConv();
- bool isVarArg = TheCall->isVarArg();
- SDValue Callee = TheCall->getCallee();
- DebugLoc dl = TheCall->getDebugLoc();
+SDValue
+ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC, /* Return*/ false));
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallOperands(Outs,
+ CCAssignFnForNode(CallConv, /* Return*/ false,
+ isVarArg));
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -851,12 +920,12 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
- SDValue Arg = TheCall->getArg(realArgIdx);
- ISD::ArgFlagsTy Flags = TheCall->getArgFlags(realArgIdx);
+ SDValue Arg = Outs[realArgIdx].Val;
+ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
// Promote the value if needed.
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
@@ -872,7 +941,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
break;
}
- // f64 and v2f64 are passed in i32 pairs and must be split into pieces
+ // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
if (VA.needsCustom()) {
if (VA.getLocVT() == MVT::v2f64) {
SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
@@ -880,23 +949,23 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
DAG.getConstant(1, MVT::i32));
- PassF64ArgInRegs(TheCall, DAG, Chain, Op0, RegsToPass,
+ PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
VA = ArgLocs[++i]; // skip ahead to next loc
if (VA.isRegLoc()) {
- PassF64ArgInRegs(TheCall, DAG, Chain, Op1, RegsToPass,
+ PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
} else {
assert(VA.isMemLoc());
if (StackPtr.getNode() == 0)
StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
- MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
- Chain, Op1, Flags));
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
+ dl, DAG, VA, Flags));
}
} else {
- PassF64ArgInRegs(TheCall, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+ PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
StackPtr, MemOpChains, Flags);
}
} else if (VA.isRegLoc()) {
@@ -906,8 +975,8 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
if (StackPtr.getNode() == 0)
StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
- MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
- Chain, Arg, Flags));
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+ dl, DAG, VA, Flags));
}
}
@@ -933,17 +1002,17 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
GlobalValue *GV = G->getGlobal();
isDirect = true;
- bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() ||
- GV->hasLinkOnceLinkage());
+ bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
getTargetMachine().getRelocationModel() != Reloc::Static;
isARMFunc = !Subtarget->isThumb() || isStub;
// ARM call to a local ARM function is predicable.
isLocalARMFunc = !Subtarget->isThumb() && !isExt;
// tBX takes a register source operand.
- if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) {
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex,
- ARMCP::CPStub, 4);
+ if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
+ ARMPCLabelIndex,
+ ARMCP::CPValue, 4);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(getPointerTy(), dl,
@@ -960,9 +1029,9 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
isARMFunc = !Subtarget->isThumb() || isStub;
// tBX takes a register source operand.
const char *Sym = S->getSymbol();
- if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) {
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(Sym, ARMPCLabelIndex,
- ARMCP::CPStub, 4);
+ if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
+ Sym, ARMPCLabelIndex, 4);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(getPointerTy(), dl,
@@ -977,7 +1046,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
// FIXME: handle tail calls differently.
unsigned CallOpc;
if (Subtarget->isThumb()) {
- if (!Subtarget->hasV5TOps() && (!isDirect || isARMFunc))
+ if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
else
CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
@@ -986,7 +1055,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL)
: ARMISD::CALL_NOLINK;
}
- if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb()) {
+ if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb1Only()) {
    // implicit def LR - LR mustn't be allocated as GPR:$dst of CALL_NOLINK
Chain = DAG.getCopyToReg(Chain, dl, ARM::LR, DAG.getUNDEF(MVT::i32),InFlag);
InFlag = Chain.getValue(1);
@@ -1011,30 +1080,31 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(0, true), InFlag);
- if (RetVT != MVT::Other)
+ if (!Ins.empty())
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
- return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
- Op.getResNo());
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins,
+ dl, DAG, InVals);
}
-SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
- // The chain is always operand #0
- SDValue Chain = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+SDValue
+ARMTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
// CCValAssign - represent the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
- unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
- bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
// CCState - Info about the registers and stack slots.
- CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+ *DAG.getContext());
- // Analyze return values of ISD::RET.
- CCInfo.AnalyzeReturn(Op.getNode(), CCAssignFnForNode(CC, /* Return */ true));
+ // Analyze outgoing return values.
+ CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
+ isVarArg));
// If this is the first return lowered for this function, add
// the regs to the liveout set for the function.
@@ -1053,12 +1123,10 @@ SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- // ISD::RET => ret chain, (regnum1,val1), ...
- // So i*2+1 index only the regnums
- SDValue Arg = Op.getOperand(realRVLocIdx*2+1);
+ SDValue Arg = Outs[realRVLocIdx].Val;
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg);
@@ -1112,13 +1180,13 @@ SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
-// their target countpart wrapped in the ARMISD::Wrapper node. Suppose N is
+// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOVi.
static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
// FIXME there is no actual debug info here
DebugLoc dl = Op.getDebugLoc();
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
@@ -1137,11 +1205,11 @@ SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG) {
DebugLoc dl = GA->getDebugLoc();
- MVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy();
unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue,
- PCAdj, "tlsgd", true);
+ new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj, "tlsgd", true);
SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, NULL, 0);
@@ -1154,12 +1222,13 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Argument;
- Entry.Ty = (const Type *) Type::Int32Ty;
+ Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext());
Args.push_back(Entry);
// FIXME: is there useful debug info available here?
std::pair<SDValue, SDValue> CallResult =
- LowerCallTo(Chain, (const Type *) Type::Int32Ty, false, false, false, false,
- 0, CallingConv::C, false,
+ LowerCallTo(Chain, (const Type *) Type::getInt32Ty(*DAG.getContext()),
+ false, false, false, false,
+ 0, CallingConv::C, false, /*isReturnValueUsed=*/true,
DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
return CallResult.first;
}
@@ -1173,16 +1242,16 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
DebugLoc dl = GA->getDebugLoc();
SDValue Offset;
SDValue Chain = DAG.getEntryNode();
- MVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy();
// Get the Thread Pointer
SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
- if (GV->isDeclaration()){
+ if (GV->isDeclaration()) {
// initial exec model
unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue,
- PCAdj, "gottpoff", true);
+ new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj, "gottpoff", true);
Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0);
@@ -1194,8 +1263,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0);
} else {
// local exec model
- ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GV, ARMCP::CPValue, "tpoff");
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, "tpoff");
Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0);
@@ -1222,59 +1290,47 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
SelectionDAG &DAG) {
- MVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy();
DebugLoc dl = Op.getDebugLoc();
GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
if (RelocM == Reloc::PIC_) {
bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GV, ARMCP::CPValue, UseGOTOFF ? "GOTOFF":"GOT");
+ new ARMConstantPoolValue(GV, UseGOTOFF ? "GOTOFF" : "GOT");
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
- CPAddr, NULL, 0);
+ CPAddr,
+ PseudoSourceValue::getConstantPool(), 0);
SDValue Chain = Result.getValue(1);
SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
if (!UseGOTOFF)
- Result = DAG.getLoad(PtrVT, dl, Chain, Result, NULL, 0);
+ Result = DAG.getLoad(PtrVT, dl, Chain, Result,
+ PseudoSourceValue::getGOT(), 0);
return Result;
} else {
SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+ PseudoSourceValue::getConstantPool(), 0);
}
}
-/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol
-/// even in non-static mode.
-static bool GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) {
- // If symbol visibility is hidden, the extra load is not needed if
- // the symbol is definitely defined in the current translation unit.
- bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode();
- if (GV->hasHiddenVisibility() && (!isDecl && !GV->hasCommonLinkage()))
- return false;
- return RelocM != Reloc::Static && (isDecl || GV->isWeakForLinker());
-}
-
SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SelectionDAG &DAG) {
- MVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy();
DebugLoc dl = Op.getDebugLoc();
GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
- bool IsIndirect = GVIsIndirectSymbol(GV, RelocM);
SDValue CPAddr;
if (RelocM == Reloc::Static)
CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
else {
- unsigned PCAdj = (RelocM != Reloc::PIC_)
- ? 0 : (Subtarget->isThumb() ? 4 : 8);
- ARMCP::ARMCPKind Kind = IsIndirect ? ARMCP::CPNonLazyPtr
- : ARMCP::CPValue;
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex,
- Kind, PCAdj);
+ unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
+ ARMConstantPoolValue *CPV =
+ new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj);
CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
}
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
@@ -1286,7 +1342,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
}
- if (IsIndirect)
+
+ if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
Result = DAG.getLoad(PtrVT, dl, Chain, Result, NULL, 0);
return Result;
@@ -1296,32 +1353,55 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
SelectionDAG &DAG){
assert(Subtarget->isTargetELF() &&
"GLOBAL OFFSET TABLE not implemented for non-ELF targets");
- MVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy();
DebugLoc dl = Op.getDebugLoc();
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue("_GLOBAL_OFFSET_TABLE_",
- ARMPCLabelIndex,
- ARMCP::CPValue, PCAdj);
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
+ "_GLOBAL_OFFSET_TABLE_",
+ ARMPCLabelIndex, PCAdj);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0);
+ SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+ PseudoSourceValue::getConstantPool(), 0);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
}
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
DebugLoc dl = Op.getDebugLoc();
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- case Intrinsic::arm_thread_pointer:
- return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+ case Intrinsic::arm_thread_pointer: {
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+ }
+ case Intrinsic::eh_sjlj_lsda: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ EVT PtrVT = getPointerTy();
+ DebugLoc dl = Op.getDebugLoc();
+ Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+ SDValue CPAddr;
+ unsigned PCAdj = (RelocM != Reloc::PIC_)
+ ? 0 : (Subtarget->isThumb() ? 4 : 8);
+ ARMConstantPoolValue *CPV =
+ new ARMConstantPoolValue(MF.getFunction(), ARMPCLabelIndex,
+ ARMCP::CPLSDA, PCAdj);
+ CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ SDValue Result =
+ DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0);
+ SDValue Chain = Result.getValue(1);
+
+ if (RelocM == Reloc::PIC_) {
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+ Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+ }
+ return Result;
+ }
case Intrinsic::eh_sjlj_setjmp:
- SDValue Res = DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32,
- Op.getOperand(1));
- return Res;
+ return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(1));
}
}
@@ -1330,13 +1410,60 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
DebugLoc dl = Op.getDebugLoc();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
}
SDValue
+ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ EVT VT = Node->getValueType(0);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ SDValue Align = Op.getOperand(2);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+
+ unsigned AlignVal = cast<ConstantSDNode>(Align)->getZExtValue();
+ unsigned StackAlign = getTargetMachine().getFrameInfo()->getStackAlignment();
+ if (AlignVal > StackAlign)
+    // Do this now since the selection pass cannot introduce new
+    // target-independent nodes.
+ Align = DAG.getConstant(-(uint64_t)AlignVal, VT);
+
+  // In Thumb1 mode there is no "sub r, sp, r" instruction, so we end up
+  // using an "add r, sp, r" instead. Negate the size now so we don't have
+  // to resort to an even more horrible hack later.
+ MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ if (AFI->isThumb1OnlyFunction()) {
+ bool Negate = true;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size);
+ if (C) {
+ uint32_t Val = C->getZExtValue();
+ if (Val <= 508 && ((Val & 3) == 0))
+ Negate = false;
+ }
+ if (Negate)
+ Size = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), Size);
+ }
+
+ SDVTList VTList = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops1[] = { Chain, Size, Align };
+ SDValue Res = DAG.getNode(ARMISD::DYN_ALLOC, dl, VTList, Ops1, 3);
+ Chain = Res.getValue(1);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
+ DAG.getIntPtrConstant(0, true), SDValue());
+ SDValue Ops2[] = { Res, Chain };
+ return DAG.getMergeValues(Ops2, 2, dl);
+}
+
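A standalone sketch of the Thumb1 decision above (the helper name is mine, not
LLVM API): a constant allocation size avoids the 0-minus-size negation only
when it fits the tSUBspi immediate, i.e. a multiple of 4 no larger than 508.

    #include <cstdint>

    // Sketch of the size-negation decision in LowerDYNAMIC_STACKALLOC;
    // hypothetical helper, for illustration only.
    static bool needsNegatedSize(bool isThumb1, bool sizeIsConst,
                                 uint32_t size) {
      if (!isThumb1)
        return false;                  // ARM/Thumb2 have "sub r, sp, r"
      if (sizeIsConst && size <= 508 && (size & 3) == 0)
        return false;                  // fits tSUBspi directly
      return true;                     // negate, then "add r, sp, r"
    }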
+SDValue
ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
DebugLoc dl) {
@@ -1344,7 +1471,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
TargetRegisterClass *RC;
- if (AFI->isThumbFunction())
+ if (AFI->isThumb1OnlyFunction())
RC = ARM::tGPRRegisterClass;
else
RC = ARM::GPRRegisterClass;
@@ -1371,21 +1498,25 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
}
SDValue
-ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
+ARMTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
- SDValue Root = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
- bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
- unsigned CC = MF.getFunction()->getCallingConv();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeFormalArguments(Op.getNode(),
- CCAssignFnForNode(CC, /* Return*/ false));
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins,
+ CCAssignFnForNode(CallConv, /* Return*/ false,
+ isVarArg));
SmallVector<SDValue, 16> ArgValues;
@@ -1394,7 +1525,7 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
// Arguments stored in registers.
if (VA.isRegLoc()) {
- MVT RegVT = VA.getLocVT();
+ EVT RegVT = VA.getLocVT();
SDValue ArgValue;
if (VA.needsCustom()) {
@@ -1404,43 +1535,43 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
if (VA.getLocVT() == MVT::v2f64) {
SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
- Root, DAG, dl);
+ Chain, DAG, dl);
VA = ArgLocs[++i]; // skip ahead to next loc
SDValue ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
- Root, DAG, dl);
+ Chain, DAG, dl);
ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
} else
- ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Root, DAG, dl);
+ ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
} else {
TargetRegisterClass *RC;
- if (FloatABIType == FloatABI::Hard && RegVT == MVT::f32)
+
+ if (RegVT == MVT::f32)
RC = ARM::SPRRegisterClass;
- else if (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)
+ else if (RegVT == MVT::f64)
RC = ARM::DPRRegisterClass;
- else if (AFI->isThumbFunction())
- RC = ARM::tGPRRegisterClass;
+ else if (RegVT == MVT::v2f64)
+ RC = ARM::QPRRegisterClass;
+ else if (RegVT == MVT::i32)
+ RC = (AFI->isThumb1OnlyFunction() ?
+ ARM::tGPRRegisterClass : ARM::GPRRegisterClass);
else
- RC = ARM::GPRRegisterClass;
-
- assert((RegVT == MVT::i32 || RegVT == MVT::f32 ||
- (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)) &&
- "RegVT not supported by FORMAL_ARGUMENTS Lowering");
+ llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
- ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
// If this is an 8 or 16-bit value, it is really passed promoted
// to 32 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
@@ -1457,7 +1588,7 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
break;
}
- ArgValues.push_back(ArgValue);
+ InVals.push_back(ArgValue);
} else { // VA.isRegLoc()
@@ -1470,7 +1601,7 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
- ArgValues.push_back(DAG.getLoad(VA.getValVT(), dl, Root, FIN, NULL, 0));
+ InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0));
}
}
@@ -1500,31 +1631,27 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
SmallVector<SDValue, 4> MemOps;
for (; NumGPRs < 4; ++NumGPRs) {
TargetRegisterClass *RC;
- if (AFI->isThumbFunction())
+ if (AFI->isThumb1OnlyFunction())
RC = ARM::tGPRRegisterClass;
else
RC = ARM::GPRRegisterClass;
unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC);
- SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
DAG.getConstant(4, getPointerTy()));
}
if (!MemOps.empty())
- Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOps[0], MemOps.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
} else
// This will point to the next argument passed via stack.
VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset);
}
- ArgValues.push_back(Root);
-
- // Return the new list of results.
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
- &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+ return Chain;
}
/// isFloatingPointZero - Return true if this is +0.0.
@@ -1543,46 +1670,46 @@ static bool isFloatingPointZero(SDValue Op) {
return false;
}
-static bool isLegalCmpImmediate(unsigned C, bool isThumb) {
- return ( isThumb && (C & ~255U) == 0) ||
- (!isThumb && ARM_AM::getSOImmVal(C) != -1);
+static bool isLegalCmpImmediate(unsigned C, bool isThumb1Only) {
+ return ( isThumb1Only && (C & ~255U) == 0) ||
+ (!isThumb1Only && ARM_AM::getSOImmVal(C) != -1);
}
/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
static SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &ARMCC, SelectionDAG &DAG, bool isThumb,
+ SDValue &ARMCC, SelectionDAG &DAG, bool isThumb1Only,
DebugLoc dl) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
unsigned C = RHSC->getZExtValue();
- if (!isLegalCmpImmediate(C, isThumb)) {
+ if (!isLegalCmpImmediate(C, isThumb1Only)) {
// Constant does not fit, try adjusting it by one?
switch (CC) {
default: break;
case ISD::SETLT:
case ISD::SETGE:
- if (isLegalCmpImmediate(C-1, isThumb)) {
+ if (isLegalCmpImmediate(C-1, isThumb1Only)) {
CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
RHS = DAG.getConstant(C-1, MVT::i32);
}
break;
case ISD::SETULT:
case ISD::SETUGE:
- if (C > 0 && isLegalCmpImmediate(C-1, isThumb)) {
+ if (C > 0 && isLegalCmpImmediate(C-1, isThumb1Only)) {
CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
RHS = DAG.getConstant(C-1, MVT::i32);
}
break;
case ISD::SETLE:
case ISD::SETGT:
- if (isLegalCmpImmediate(C+1, isThumb)) {
+ if (isLegalCmpImmediate(C+1, isThumb1Only)) {
CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
RHS = DAG.getConstant(C+1, MVT::i32);
}
break;
case ISD::SETULE:
case ISD::SETUGT:
- if (C < 0xffffffff && isLegalCmpImmediate(C+1, isThumb)) {
+ if (C < 0xffffffff && isLegalCmpImmediate(C+1, isThumb1Only)) {
CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
RHS = DAG.getConstant(C+1, MVT::i32);
}
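The rewrites above lean on simple order identities, e.g. x < C iff x <= C-1
for signed compares and, for C > 0, the analogous unsigned forms. A
brute-force check of the signed case over a small range (plain standalone
code, not LLVM API):

    #include <cassert>
    #include <cstdint>

    // Verify the identity getARMCmp exploits when a compare immediate C
    // does not encode: SETLT C can become SETLE C-1.
    int main() {
      for (int32_t x = -1000; x <= 1000; ++x)
        for (int32_t c = -500; c <= 500; ++c)
          assert((x < c) == (x <= c - 1));
      return 0;
    }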
@@ -1620,7 +1747,7 @@ static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -1631,13 +1758,12 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
if (LHS.getValueType() == MVT::i32) {
SDValue ARMCC;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
- SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb(), dl);
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb1Only(), dl);
return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, CCR,Cmp);
}
ARMCC::CondCodes CondCode, CondCode2;
- if (FPCCToARMCC(CC, CondCode, CondCode2))
- std::swap(TrueVal, FalseVal);
+ FPCCToARMCC(CC, CondCode, CondCode2);
SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
@@ -1666,16 +1792,14 @@ static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
if (LHS.getValueType() == MVT::i32) {
SDValue ARMCC;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
- SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb(), dl);
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb1Only(), dl);
return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
Chain, Dest, ARMCC, CCR,Cmp);
}
assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
ARMCC::CondCodes CondCode, CondCode2;
- if (FPCCToARMCC(CC, CondCode, CondCode2))
- // Swap the LHS/RHS of the comparison if needed.
- std::swap(LHS, RHS);
+ FPCCToARMCC(CC, CondCode, CondCode2);
SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
@@ -1697,21 +1821,32 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) {
SDValue Index = Op.getOperand(2);
DebugLoc dl = Op.getDebugLoc();
- MVT PTy = getPointerTy();
+ EVT PTy = getPointerTy();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
- SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
+ SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
- bool isPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
- Addr = DAG.getLoad(isPIC ? (MVT)MVT::i32 : PTy, dl,
- Chain, Addr, NULL, 0);
- Chain = Addr.getValue(1);
- if (isPIC)
+ if (Subtarget->isThumb2()) {
+ // Thumb2 uses a two-level jump. That is, it jumps into the jump table
+ // which does another jump to the destination. This also makes it easier
+ // to translate it to TBB / TBH later.
+ // FIXME: This might not work if the function is extremely large.
+ return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
+ Addr, Op.getOperand(2), JTI, UId);
+ }
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, NULL, 0);
+ Chain = Addr.getValue(1);
Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
- return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+ return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+ } else {
+ Addr = DAG.getLoad(PTy, dl, Chain, Addr, NULL, 0);
+ Chain = Addr.getValue(1);
+ return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+ }
}
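A sketch of the dispatch the non-Thumb2 paths above produce. Under static
relocation each 4-byte table entry holds an absolute address; under PIC it
holds a table-relative offset, which is why the PIC path re-adds Table after
the load. The helper below is illustrative, not generated code:

    #include <cstdint>

    static uintptr_t jumpTarget(const uint32_t *table, unsigned index,
                                bool pic) {
      uint32_t entry = table[index];        // load from Table + Index*4
      return pic ? (uintptr_t)table + entry // PIC: entry is table-relative
                 : (uintptr_t)entry;        // static: entry is absolute
    }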
static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
@@ -1723,7 +1858,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
unsigned Opc =
Op.getOpcode() == ISD::SINT_TO_FP ? ARMISD::SITOF : ARMISD::UITOF;
@@ -1737,8 +1872,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Tmp0 = Op.getOperand(0);
SDValue Tmp1 = Op.getOperand(1);
DebugLoc dl = Op.getDebugLoc();
- MVT VT = Op.getValueType();
- MVT SrcVT = Tmp1.getValueType();
+ EVT VT = Op.getValueType();
+ EVT SrcVT = Tmp1.getValueType();
SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
SDValue Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG, dl);
SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32);
@@ -1749,7 +1884,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setFrameAddressIsTaken(true);
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
@@ -1784,7 +1919,7 @@ ARMTargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
unsigned BytesLeft = SizeVal & 3;
unsigned NumMemOps = SizeVal >> 2;
unsigned EmittedNumMemOps = 0;
- MVT VT = MVT::i32;
+ EVT VT = MVT::i32;
unsigned VTSize = 4;
unsigned i = 0;
const unsigned MAX_LOADS_IN_LDM = 6;
@@ -1890,45 +2025,55 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
-static SDValue getZeroVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
// Zero vectors are used to represent vector negation and in those cases
// will be implemented with the NEON VNEG instruction. However, VNEG does
// not support i64 elements, so sometimes the zero vectors will need to be
// explicitly constructed. For those cases, and potentially other uses in
- // the future, always build zero vectors as <4 x i32> or <2 x i32> bitcasted
+ // the future, always build zero vectors as <16 x i8> or <8 x i8> bitcasted
// to their dest type. This ensures they get CSE'd.
SDValue Vec;
- SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
- if (VT.getSizeInBits() == 64)
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
- else
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i8);
+ SmallVector<SDValue, 8> Ops;
+ MVT TVT;
+
+ if (VT.getSizeInBits() == 64) {
+ Ops.assign(8, Cst); TVT = MVT::v8i8;
+ } else {
+ Ops.assign(16, Cst); TVT = MVT::v16i8;
+ }
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size());
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}
/// getOnesVector - Returns a vector of specified type with all bits set.
///
-static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
- // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
- // type. This ensures they get CSE'd.
+  // Always build ones vectors as <16 x i8> or <8 x i8> bitcasted to their
+ // dest type. This ensures they get CSE'd.
SDValue Vec;
- SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
- if (VT.getSizeInBits() == 64)
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
- else
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ SDValue Cst = DAG.getTargetConstant(0xFF, MVT::i8);
+ SmallVector<SDValue, 8> Ops;
+ MVT TVT;
+
+ if (VT.getSizeInBits() == 64) {
+ Ops.assign(8, Cst); TVT = MVT::v8i8;
+ } else {
+ Ops.assign(16, Cst); TVT = MVT::v16i8;
+ }
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size());
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}
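Both helpers above funnel every all-zeros and all-ones vector through a single
byte-vector type before bitcasting, so the DAG's node table can share one node
across requests with different result types. A toy hash-consing table (not
LLVM's implementation) showing why the keys must match exactly:

    #include <cassert>
    #include <map>
    #include <string>
    #include <tuple>

    struct NodeKey { std::string opc, ty, payload; };
    static bool operator<(const NodeKey &a, const NodeKey &b) {
      return std::tie(a.opc, a.ty, a.payload) <
             std::tie(b.opc, b.ty, b.payload);
    }

    int main() {
      std::map<NodeKey, int> cse;
      int nextId = 0;
      auto getNode = [&](const NodeKey &k) {
        auto it = cse.find(k);
        return it != cse.end() ? it->second : (cse[k] = nextId++);
      };
      // Zero vectors for v4i32 and v2i64 are both built as zero v16i8,
      // so the second request reuses the first node:
      int a = getNode({"BUILD_VECTOR", "v16i8", "zeros"});
      int b = getNode({"BUILD_VECTOR", "v16i8", "zeros"});
      assert(a == b);
      return 0;
    }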
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
- MVT VT = N->getValueType(0);
+ EVT VT = N->getValueType(0);
DebugLoc dl = N->getDebugLoc();
// Lower vector shifts on NEON to use VSHL.
@@ -1947,7 +2092,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
// NEON uses the same intrinsics for both left and right shifts. For
// right shifts, the shift amounts are negative, so negate the vector of
// shift amounts.
- MVT ShiftVT = N->getOperand(1).getValueType();
+ EVT ShiftVT = N->getOperand(1).getValueType();
SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
getZeroVector(ShiftVT, DAG, dl),
N->getOperand(1));
@@ -1959,8 +2104,11 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
N->getOperand(0), NegatedCount);
}
- assert(VT == MVT::i64 &&
- (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ // We can get here for a node like i32 = ISD::SHL i32, i64
+ if (VT != MVT::i64)
+ return SDValue();
+
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
"Unknown shift to lower!");
// We only lower SRA, SRL of 1 here, all others use generic lowering.
@@ -1969,7 +2117,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
return SDValue();
// If we are in thumb mode, we don't have RRX.
- if (ST->isThumb()) return SDValue();
+ if (ST->isThumb1Only()) return SDValue();
// Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
@@ -1998,13 +2146,13 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
DebugLoc dl = Op.getDebugLoc();
if (Op.getOperand(1).getValueType().isFloatingPoint()) {
switch (SetCCOpcode) {
- default: assert(0 && "Illegal FP comparison"); break;
+ default: llvm_unreachable("Illegal FP comparison"); break;
case ISD::SETUNE:
case ISD::SETNE: Invert = true; // Fallthrough
case ISD::SETOEQ:
@@ -2043,7 +2191,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
} else {
// Integer comparisons.
switch (SetCCOpcode) {
- default: assert(0 && "Illegal integer comparison"); break;
+ default: llvm_unreachable("Illegal integer comparison"); break;
case ISD::SETNE: Invert = true;
case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
case ISD::SETLT: Swap = true;
@@ -2056,7 +2204,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
}
- // Detect VTST (Vector Test Bits) = vicmp ne (and (op0, op1), zero).
+ // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
if (Opc == ARMISD::VCEQ) {
SDValue AndOp;
@@ -2147,7 +2295,7 @@ static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef,
}
default:
- assert(0 && "unexpected size for isVMOVSplat");
+ llvm_unreachable("unexpected size for isVMOVSplat");
break;
}
@@ -2174,22 +2322,123 @@ SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
SplatBitSize, DAG);
}
-static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT,
+ bool &ReverseVEXT, unsigned &Imm) {
+ unsigned NumElts = VT.getVectorNumElements();
+ ReverseVEXT = false;
+ Imm = M[0];
+
+ // If this is a VEXT shuffle, the immediate value is the index of the first
+ // element. The other shuffle indices must be the successive elements after
+ // the first one.
+ unsigned ExpectedElt = Imm;
+ for (unsigned i = 1; i < NumElts; ++i) {
+ // Increment the expected index. If it wraps around, it may still be
+ // a VEXT but the source vectors must be swapped.
+ ExpectedElt += 1;
+ if (ExpectedElt == NumElts * 2) {
+ ExpectedElt = 0;
+ ReverseVEXT = true;
+ }
+
+ if (ExpectedElt != static_cast<unsigned>(M[i]))
+ return false;
+ }
+
+ // Adjust the index value if the source operands will be swapped.
+ if (ReverseVEXT)
+ Imm -= NumElts;
+
+ return true;
+}
+
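A worked example of the VEXT matching above, restated as standalone code: on
two 4-element vectors, mask <1,2,3,4> is VEXT #1, while <7,0,1,2> wraps past
the end of the concatenation, so the operands are swapped and the immediate
adjusted to 3. Illustrative reimplementation, not the LLVM function:

    #include <cassert>
    #include <vector>

    static bool isVEXTMaskSketch(const std::vector<int> &M, unsigned NumElts,
                                 bool &Reverse, unsigned &Imm) {
      Reverse = false;
      Imm = M[0];
      unsigned Expected = Imm;
      for (unsigned i = 1; i < NumElts; ++i) {
        if (++Expected == NumElts * 2) { Expected = 0; Reverse = true; }
        if ((unsigned)M[i] != Expected)
          return false;
      }
      if (Reverse)
        Imm -= NumElts;
      return true;
    }

    int main() {
      bool Rev; unsigned Imm;
      assert(isVEXTMaskSketch({1,2,3,4}, 4, Rev, Imm) && !Rev && Imm == 1);
      assert(isVEXTMaskSketch({7,0,1,2}, 4, Rev, Imm) && Rev && Imm == 3);
      return 0;
    }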
+/// isVREVMask - Check if a vector shuffle corresponds to a VREV
+/// instruction with the specified blocksize. (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT,
+ unsigned BlockSize) {
+ assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
+ "Only possible block sizes for VREV are: 16, 32, 64");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+ unsigned BlockElts = M[0] + 1;
+
+ if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+ return false;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if ((unsigned) M[i] !=
+ (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
+ return false;
+ }
+
+ return true;
+}
+
+static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
+ unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if ((unsigned) M[i] != i + WhichResult ||
+ (unsigned) M[i+1] != i + NumElts + WhichResult)
+ return false;
+ }
+ return true;
+}
+
+static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
+ unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if ((unsigned) M[i] != 2 * i + WhichResult)
+ return false;
+ }
+
+ // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+ if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32)
+ return false;
+
+ return true;
+}
+
+static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
+ unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned i = 0; i != NumElts; i += 2) {
+ if ((unsigned) M[i] != Idx ||
+ (unsigned) M[i+1] != Idx + NumElts)
+ return false;
+ Idx += 1;
+ }
+
+ // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+ if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32)
+ return false;
+
+ return true;
+}
+
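Worked examples of the masks the three checkers above accept for a 4-element
type, checked by hand against their loops (plain standalone code):

    VTRN result 0: <0,4,2,6>   result 1: <1,5,3,7>
    VUZP result 0: <0,2,4,6>   result 1: <1,3,5,7>
    VZIP result 0: <0,4,1,5>   result 1: <2,6,3,7>

    #include <cassert>
    #include <vector>

    int main() {
      const unsigned N = 4;
      std::vector<int> vtrn0{0, 4, 2, 6}, vuzp0{0, 2, 4, 6},
                       vzip0{0, 4, 1, 5};
      // VTRN: even lanes from V1, odd lanes from the same lane of V2.
      for (unsigned i = 0; i < N; i += 2)
        assert((unsigned)vtrn0[i] == i && (unsigned)vtrn0[i+1] == i + N);
      // VUZP: every second element across the concatenation.
      for (unsigned i = 0; i != N; ++i)
        assert((unsigned)vuzp0[i] == 2 * i);
      // VZIP: interleave the low halves of both vectors.
      for (unsigned i = 0, idx = 0; i != N; i += 2, ++idx)
        assert((unsigned)vzip0[i] == idx && (unsigned)vzip0[i+1] == idx + N);
      return 0;
    }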
+static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) {
// Canonicalize all-zeros and all-ones vectors.
- ConstantSDNode *ConstVal = dyn_cast<ConstantSDNode>(Val.getNode());
+ ConstantSDNode *ConstVal = cast<ConstantSDNode>(Val.getNode());
if (ConstVal->isNullValue())
return getZeroVector(VT, DAG, dl);
if (ConstVal->isAllOnesValue())
return getOnesVector(VT, DAG, dl);
- MVT CanonicalVT;
+ EVT CanonicalVT;
if (VT.is64BitVector()) {
switch (Val.getValueType().getSizeInBits()) {
case 8: CanonicalVT = MVT::v8i8; break;
case 16: CanonicalVT = MVT::v4i16; break;
case 32: CanonicalVT = MVT::v2i32; break;
case 64: CanonicalVT = MVT::v1i64; break;
- default: assert(0 && "unexpected splat element type"); break;
+ default: llvm_unreachable("unexpected splat element type"); break;
}
} else {
assert(VT.is128BitVector() && "unknown splat vector size");
@@ -2198,7 +2447,7 @@ static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) {
case 16: CanonicalVT = MVT::v8i16; break;
case 32: CanonicalVT = MVT::v4i32; break;
case 64: CanonicalVT = MVT::v2i64; break;
- default: assert(0 && "unexpected splat element type"); break;
+ default: llvm_unreachable("unexpected splat element type"); break;
}
}
@@ -2213,69 +2462,291 @@ static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) {
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
- BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
- assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
DebugLoc dl = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
- SDValue Val = isVMOVSplat(SplatBits.getZExtValue(),
- SplatUndef.getZExtValue(), SplatBitSize, DAG);
- if (Val.getNode())
- return BuildSplat(Val, Op.getValueType(), DAG, dl);
+ if (SplatBitSize <= 64) {
+ SDValue Val = isVMOVSplat(SplatBits.getZExtValue(),
+ SplatUndef.getZExtValue(), SplatBitSize, DAG);
+ if (Val.getNode())
+ return BuildSplat(Val, VT, DAG, dl);
+ }
+ }
+
+ // If there are only 2 elements in a 128-bit vector, insert them into an
+ // undef vector. This handles the common case for 128-bit vector argument
+ // passing, where the insertions should be translated to subreg accesses
+ // with no real instructions.
+ if (VT.is128BitVector() && Op.getNumOperands() == 2) {
+ SDValue Val = DAG.getUNDEF(VT);
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ if (Op0.getOpcode() != ISD::UNDEF)
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op0,
+ DAG.getIntPtrConstant(0));
+ if (Op1.getOpcode() != ISD::UNDEF)
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op1,
+ DAG.getIntPtrConstant(1));
+ return Val;
}
return SDValue();
}
-static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
- return Op;
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+ EVT VT) const {
+ if (VT.getVectorNumElements() == 4 &&
+ (VT.is128BitVector() || VT.is64BitVector())) {
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (M[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = M[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex =
+ PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30);
+
+ if (Cost <= 4)
+ return true;
+ }
+
+ bool ReverseVEXT;
+ unsigned Imm, WhichResult;
+
+ return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ isVREVMask(M, VT, 64) ||
+ isVREVMask(M, VT, 32) ||
+ isVREVMask(M, VT, 16) ||
+ isVEXTMask(M, VT, ReverseVEXT, Imm) ||
+ isVTRNMask(M, VT, WhichResult) ||
+ isVUZPMask(M, VT, WhichResult) ||
+ isVZIPMask(M, VT, WhichResult));
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+ SDValue RHS, SelectionDAG &DAG,
+ DebugLoc dl) {
+ unsigned OpNum = (PFEntry >> 26) & 0x0F;
+ unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+ unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
+
+ enum {
+ OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+ OP_VREV,
+ OP_VDUP0,
+ OP_VDUP1,
+ OP_VDUP2,
+ OP_VDUP3,
+ OP_VEXT1,
+ OP_VEXT2,
+ OP_VEXT3,
+ OP_VUZPL, // VUZP, left result
+ OP_VUZPR, // VUZP, right result
+ OP_VZIPL, // VZIP, left result
+ OP_VZIPR, // VZIP, right result
+ OP_VTRNL, // VTRN, left result
+ OP_VTRNR // VTRN, right result
+ };
+
+ if (OpNum == OP_COPY) {
+ if (LHSID == (1*9+2)*9+3) return LHS;
+ assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+ return RHS;
+ }
+
+ SDValue OpLHS, OpRHS;
+ OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+ OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+ EVT VT = OpLHS.getValueType();
+
+ switch (OpNum) {
+ default: llvm_unreachable("Unknown shuffle opcode!");
+ case OP_VREV:
+ return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+ case OP_VDUP0:
+ case OP_VDUP1:
+ case OP_VDUP2:
+ case OP_VDUP3:
+ return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+ OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
+ case OP_VEXT1:
+ case OP_VEXT2:
+ case OP_VEXT3:
+ return DAG.getNode(ARMISD::VEXT, dl, VT,
+ OpLHS, OpRHS,
+ DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
+ case OP_VUZPL:
+ case OP_VUZPR:
+ return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+ OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
+ case OP_VZIPL:
+ case OP_VZIPR:
+ return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+ OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
+ case OP_VTRNL:
+ case OP_VTRNR:
+ return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+ OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
+ }
}
-static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
- return Op;
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ SmallVector<int, 8> ShuffleMask;
+
+ // Convert shuffles that are directly supported on NEON to target-specific
+ // DAG nodes, instead of keeping them as shuffles and matching them again
+ // during code selection. This is more efficient and avoids the possibility
+ // of inconsistencies between legalization and selection.
+ // FIXME: floating-point vectors should be canonicalized to integer vectors
+  // of the same type so that they get CSE'd properly.
+ SVN->getMask(ShuffleMask);
+
+ if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+ int Lane = SVN->getSplatIndex();
+ if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+ }
+ return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
+ DAG.getConstant(Lane, MVT::i32));
+ }
+
+ bool ReverseVEXT;
+ unsigned Imm;
+ if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+ if (ReverseVEXT)
+ std::swap(V1, V2);
+ return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
+ DAG.getConstant(Imm, MVT::i32));
+ }
+
+ if (isVREVMask(ShuffleMask, VT, 64))
+ return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
+ if (isVREVMask(ShuffleMask, VT, 32))
+ return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
+ if (isVREVMask(ShuffleMask, VT, 16))
+ return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+
+ // Check for Neon shuffles that modify both input vectors in place.
+ // If both results are used, i.e., if there are two shuffles with the same
+ // source operands and with masks corresponding to both results of one of
+ // these operations, DAG memoization will ensure that a single node is
+ // used for both shuffles.
+ unsigned WhichResult;
+ if (isVTRNMask(ShuffleMask, VT, WhichResult))
+ return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+ V1, V2).getValue(WhichResult);
+ if (isVUZPMask(ShuffleMask, VT, WhichResult))
+ return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+ V1, V2).getValue(WhichResult);
+ if (isVZIPMask(ShuffleMask, VT, WhichResult))
+ return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+ V1, V2).getValue(WhichResult);
+
+ // If the shuffle is not directly supported and it has 4 elements, use
+ // the PerfectShuffle-generated table to synthesize it from other shuffles.
+ if (VT.getVectorNumElements() == 4 &&
+ (VT.is128BitVector() || VT.is64BitVector())) {
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (ShuffleMask[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = ShuffleMask[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex =
+ PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30);
+
+ if (Cost <= 4)
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ }
+
+ return SDValue();
}
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
- assert((VT == MVT::i8 || VT == MVT::i16) &&
- "unexpected type for custom-lowering vector extract");
SDValue Vec = Op.getOperand(0);
SDValue Lane = Op.getOperand(1);
+
+ // FIXME: This is invalid for 8 and 16-bit elements - the information about
+ // sign / zero extension is lost!
Op = DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
Op = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Op, DAG.getValueType(VT));
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
+
+ if (VT.bitsLT(MVT::i32))
+ Op = DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
+ else if (VT.bitsGT(MVT::i32))
+ Op = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op);
+
+ return Op;
}
-static SDValue LowerCONCAT_VECTORS(SDValue Op) {
- if (Op.getValueType().is128BitVector() && Op.getNumOperands() == 2)
- return Op;
- return SDValue();
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+ // The only time a CONCAT_VECTORS operation can have legal types is when
+ // two 64-bit vectors are concatenated to a 128-bit vector.
+ assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
+ "unexpected CONCAT_VECTORS");
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Val = DAG.getUNDEF(MVT::v2f64);
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ if (Op0.getOpcode() != ISD::UNDEF)
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op0),
+ DAG.getIntPtrConstant(0));
+ if (Op1.getOpcode() != ISD::UNDEF)
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op1),
+ DAG.getIntPtrConstant(1));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val);
}
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
switch (Op.getOpcode()) {
- default: assert(0 && "Don't know how to custom lower this!"); abort();
+ default: llvm_unreachable("Don't know how to custom lower this!");
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress:
return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
LowerGlobalAddressELF(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
- case ISD::CALL: return LowerCALL(Op, DAG);
- case ISD::RET: return LowerRET(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG, Subtarget);
case ISD::BR_CC: return LowerBR_CC(Op, DAG, Subtarget);
case ISD::BR_JT: return LowerBR_JT(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG, VarArgsFrameIndex);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
- case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
case ISD::RETURNADDR: break;
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
@@ -2287,9 +2758,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
case ISD::VSETCC: return LowerVSETCC(Op, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
- case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
- case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
}
return SDValue();
}
@@ -2301,7 +2771,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
SelectionDAG &DAG) {
switch (N->getOpcode()) {
default:
- assert(0 && "Don't know how to custom expand this!");
+ llvm_unreachable("Don't know how to custom expand this!");
return;
case ISD::BIT_CONVERT:
Results.push_back(ExpandBIT_CONVERT(N, DAG));
@@ -2322,12 +2792,14 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
DebugLoc dl = MI->getDebugLoc();
switch (MI->getOpcode()) {
- default: assert(false && "Unexpected instr type to insert");
- case ARM::tMOVCCr: {
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
+ case ARM::tMOVCCr_pseudo: {
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
@@ -2352,12 +2824,15 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
F->insert(It, sinkMBB);
// Update machine-CFG edges by first adding all successors of the current
// block to the new block which will contain the Phi node for the select.
- for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
- e = BB->succ_end(); i != e; ++i)
- sinkMBB->addSuccessor(*i);
+ // Also inform sdisel of the edge changes.
+ for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
+ E = BB->succ_end(); I != E; ++I) {
+ EM->insert(std::make_pair(*I, sinkMBB));
+ sinkMBB->addSuccessor(*I);
+ }
// Next, remove all successors of the current block, and add the true
// and fallthrough blocks as its successors.
- while(!BB->succ_empty())
+ while (!BB->succ_empty())
BB->removeSuccessor(BB->succ_begin());
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
@@ -2381,6 +2856,78 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
return BB;
}
+
+ case ARM::tANDsp:
+ case ARM::tADDspr_:
+ case ARM::tSUBspi_:
+ case ARM::t2SUBrSPi_:
+ case ARM::t2SUBrSPi12_:
+ case ARM::t2SUBrSPs_: {
+ MachineFunction *MF = BB->getParent();
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool DstIsDead = MI->getOperand(0).isDead();
+ bool SrcIsKill = MI->getOperand(1).isKill();
+
+ if (SrcReg != ARM::SP) {
+ // Copy the source to SP from virtual register.
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg);
+ unsigned CopyOpc = (RC == ARM::tGPRRegisterClass)
+ ? ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr;
+ BuildMI(BB, dl, TII->get(CopyOpc), ARM::SP)
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
+ }
+
+ unsigned OpOpc = 0;
+ bool NeedPred = false, NeedCC = false, NeedOp3 = false;
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected pseudo instruction!");
+ case ARM::tANDsp:
+ OpOpc = ARM::tAND;
+ NeedPred = true;
+ break;
+ case ARM::tADDspr_:
+ OpOpc = ARM::tADDspr;
+ break;
+ case ARM::tSUBspi_:
+ OpOpc = ARM::tSUBspi;
+ break;
+ case ARM::t2SUBrSPi_:
+ OpOpc = ARM::t2SUBrSPi;
+ NeedPred = true; NeedCC = true;
+ break;
+ case ARM::t2SUBrSPi12_:
+ OpOpc = ARM::t2SUBrSPi12;
+ NeedPred = true;
+ break;
+ case ARM::t2SUBrSPs_:
+ OpOpc = ARM::t2SUBrSPs;
+ NeedPred = true; NeedCC = true; NeedOp3 = true;
+ break;
+ }
+ MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(OpOpc), ARM::SP);
+ if (OpOpc == ARM::tAND)
+ AddDefaultT1CC(MIB);
+ MIB.addReg(ARM::SP);
+ MIB.addOperand(MI->getOperand(2));
+ if (NeedOp3)
+ MIB.addOperand(MI->getOperand(3));
+ if (NeedPred)
+ AddDefaultPred(MIB);
+ if (NeedCC)
+ AddDefaultCC(MIB);
+
+ // Copy the result from SP to virtual register.
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg);
+ unsigned CopyOpc = (RC == ARM::tGPRRegisterClass)
+ ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr;
+ BuildMI(BB, dl, TII->get(CopyOpc))
+ .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead))
+ .addReg(ARM::SP);
+ MF->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+ }
}
}
@@ -2393,7 +2940,7 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT VT = N->getValueType(0);
+ EVT VT = N->getValueType(0);
unsigned Opc = N->getOpcode();
bool isSlctCC = Slct.getOpcode() == ISD::SELECT_CC;
SDValue LHS = isSlctCC ? Slct.getOperand(2) : Slct.getOperand(1);
@@ -2421,7 +2968,7 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
cast<ConstantSDNode>(RHS)->isNullValue()) {
std::swap(LHS, RHS);
SDValue Op0 = Slct.getOperand(0);
- MVT OpVT = isSlctCC ? Op0.getValueType() :
+ EVT OpVT = isSlctCC ? Op0.getValueType() :
Op0.getOperand(0).getValueType();
bool isInt = OpVT.isInteger();
CC = ISD::getSetCCInverse(CC, isInt);
@@ -2516,7 +3063,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
/// operand of a vector shift left operation. That value must be in the range:
/// 0 <= Value < ElementBits for a left shift; or
/// 0 <= Value <= ElementBits for a long left shift.
-static bool isVShiftLImm(SDValue Op, MVT VT, bool isLong, int64_t &Cnt) {
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
if (! getVShiftImm(Op, ElementBits, Cnt))
@@ -2530,7 +3077,7 @@ static bool isVShiftLImm(SDValue Op, MVT VT, bool isLong, int64_t &Cnt) {
/// absolute value must be in the range:
/// 1 <= |Value| <= ElementBits for a right shift; or
/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
-static bool isVShiftRImm(SDValue Op, MVT VT, bool isNarrow, bool isIntrinsic,
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
@@ -2571,7 +3118,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vqrshiftns:
case Intrinsic::arm_neon_vqrshiftnu:
case Intrinsic::arm_neon_vqrshiftnsu: {
- MVT VT = N->getOperand(1).getValueType();
+ EVT VT = N->getOperand(1).getValueType();
int64_t Cnt;
unsigned VShiftOpc = 0;
@@ -2593,8 +3140,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vshiftlu:
if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
break;
- assert(0 && "invalid shift count for vshll intrinsic");
- abort();
+ llvm_unreachable("invalid shift count for vshll intrinsic");
case Intrinsic::arm_neon_vrshifts:
case Intrinsic::arm_neon_vrshiftu:
@@ -2611,8 +3157,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vqshiftsu:
if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
break;
- assert(0 && "invalid shift count for vqshlu intrinsic");
- abort();
+ llvm_unreachable("invalid shift count for vqshlu intrinsic");
case Intrinsic::arm_neon_vshiftn:
case Intrinsic::arm_neon_vrshiftn:
@@ -2625,11 +3170,10 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
// Narrowing shifts require an immediate right shift.
if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
break;
- assert(0 && "invalid shift count for narrowing vector shift intrinsic");
- abort();
+ llvm_unreachable("invalid shift count for narrowing vector shift intrinsic");
default:
- assert(0 && "unhandled vector shift");
+ llvm_unreachable("unhandled vector shift");
}
switch (IntNo) {
@@ -2678,7 +3222,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
}
case Intrinsic::arm_neon_vshiftins: {
- MVT VT = N->getOperand(1).getValueType();
+ EVT VT = N->getOperand(1).getValueType();
int64_t Cnt;
unsigned VShiftOpc = 0;
@@ -2687,8 +3231,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
VShiftOpc = ARMISD::VSRI;
else {
- assert(0 && "invalid shift count for vsli/vsri intrinsic");
- abort();
+ llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
}
return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
@@ -2712,7 +3255,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
- MVT VT = N->getValueType(0);
+ EVT VT = N->getValueType(0);
// Nothing to be done for scalar shifts.
if (! VT.isVector())
@@ -2722,7 +3265,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
int64_t Cnt;
switch (N->getOpcode()) {
- default: assert(0 && "unexpected shift opcode");
+ default: llvm_unreachable("unexpected shift opcode");
case ISD::SHL:
if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
@@ -2755,8 +3298,8 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue Vec = N0.getOperand(0);
SDValue Lane = N0.getOperand(1);
- MVT VT = N->getValueType(0);
- MVT EltVT = N0.getValueType();
+ EVT VT = N->getValueType(0);
+ EVT EltVT = N0.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (VT == MVT::i32 &&
@@ -2765,7 +3308,7 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
unsigned Opc = 0;
switch (N->getOpcode()) {
- default: assert(0 && "unexpected opcode");
+ default: llvm_unreachable("unexpected opcode");
case ISD::SIGN_EXTEND:
Opc = ARMISD::VGETLANEs;
break;
@@ -2802,10 +3345,88 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
+ if (!Subtarget->hasV6Ops())
+ // Pre-v6 does not support unaligned mem access.
+ return false;
+  else if (!Subtarget->hasV7Ops()) {
+ // v6 may or may not support unaligned mem access.
+ if (!Subtarget->isTargetDarwin())
+ return false;
+ }
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ return true;
+ // FIXME: VLD1 etc with standard alignment is legal.
+ }
+}
+
+static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
+ if (V < 0)
+ return false;
+
+ unsigned Scale = 1;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ // Scale == 1;
+ break;
+ case MVT::i16:
+ // Scale == 2;
+ Scale = 2;
+ break;
+ case MVT::i32:
+ // Scale == 4;
+ Scale = 4;
+ break;
+ }
+
+ if ((V & (Scale - 1)) != 0)
+ return false;
+ V /= Scale;
+ return V == (V & ((1LL << 5) - 1));
+}
+
+static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
+ const ARMSubtarget *Subtarget) {
+ bool isNeg = false;
+ if (V < 0) {
+ isNeg = true;
+ V = - V;
+ }
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ // + imm12 or - imm8
+ if (isNeg)
+ return V == (V & ((1LL << 8) - 1));
+ return V == (V & ((1LL << 12) - 1));
+ case MVT::f32:
+ case MVT::f64:
+ // Same as ARM mode. FIXME: NEON?
+ if (!Subtarget->hasVFP2())
+ return false;
+ if ((V & 3) != 0)
+ return false;
+ V >>= 2;
+ return V == (V & ((1LL << 8) - 1));
+ }
+}
+
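A hedged summary of the immediate ranges the two predicates above accept, as
standalone helpers (names are illustrative): Thumb1 allows an unsigned 5-bit
offset scaled by the access size (i8: 0..31, i16: even 0..62, i32: multiples
of 4 up to 124); Thumb2 integer accesses allow +imm12 (0..4095) or -imm8
(down to -255), and VFP f32/f64 a multiple of 4 within +/-1020.

    #include <cassert>
    #include <cstdint>

    static int64_t maxT1Offset(unsigned accessBytes) {
      return 31 * (int64_t)accessBytes;     // 5-bit immediate times scale
    }
    static bool legalT2IntOffset(int64_t v) {
      return v < 0 ? -v <= 255 : v <= 4095; // -imm8 or +imm12
    }

    int main() {
      assert(maxT1Offset(4) == 124);        // i32 on Thumb1
      assert(legalT2IntOffset(4095) && !legalT2IntOffset(-256));
      return 0;
    }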
/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
-static bool isLegalAddressImmediate(int64_t V, MVT VT,
+static bool isLegalAddressImmediate(int64_t V, EVT VT,
const ARMSubtarget *Subtarget) {
if (V == 0)
return true;
@@ -2813,36 +3434,15 @@ static bool isLegalAddressImmediate(int64_t V, MVT VT,
if (!VT.isSimple())
return false;
- if (Subtarget->isThumb()) {
- if (V < 0)
- return false;
-
- unsigned Scale = 1;
- switch (VT.getSimpleVT()) {
- default: return false;
- case MVT::i1:
- case MVT::i8:
- // Scale == 1;
- break;
- case MVT::i16:
- // Scale == 2;
- Scale = 2;
- break;
- case MVT::i32:
- // Scale == 4;
- Scale = 4;
- break;
- }
-
- if ((V & (Scale - 1)) != 0)
- return false;
- V /= Scale;
- return V == (V & ((1LL << 5) - 1));
- }
+ if (Subtarget->isThumb1Only())
+ return isLegalT1AddressImmediate(V, VT);
+ else if (Subtarget->isThumb2())
+ return isLegalT2AddressImmediate(V, VT, Subtarget);
+ // ARM mode.
if (V < 0)
V = - V;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i1:
case MVT::i8:
@@ -2854,7 +3454,7 @@ static bool isLegalAddressImmediate(int64_t V, MVT VT,
return V == (V & ((1LL << 8) - 1));
case MVT::f32:
case MVT::f64:
- if (!Subtarget->hasVFP2())
+ if (!Subtarget->hasVFP2()) // FIXME: NEON?
return false;
if ((V & 3) != 0)
return false;
@@ -2863,11 +3463,44 @@ static bool isLegalAddressImmediate(int64_t V, MVT VT,
}
}
+bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
+ EVT VT) const {
+ int Scale = AM.Scale;
+ if (Scale < 0)
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ if (Scale == 1)
+ return true;
+ // r + r << imm
+ Scale = Scale & ~1;
+ return Scale == 2 || Scale == 4 || Scale == 8;
+ case MVT::i64:
+ // r + r
+ if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+ return true;
+ return false;
+ case MVT::isVoid:
+ // Note, we allow "void" uses (basically, uses that aren't loads or
+ // stores), because ARM allows folding a scale into many arithmetic
+ // operations. This should be made more precise and revisited later.
+
+ // Allow r << imm, but the imm has to be a multiple of two.
+ if (Scale & 1) return false;
+ return isPowerOf2_32(Scale);
+ }
+}
+
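
The bit-clearing trick above accepts scales of the form 2^n and 2^n + 1: base + idx * (2^n + 1) can be materialized as (base + idx) + (idx << n). A standalone sketch of just the i8/i16/i32 case (negative scales are rejected earlier in the real function):

    #include <cassert>
    static bool t2ScaleOK(int Scale) {   // mirrors the i8/i16/i32 case above
      if (Scale == 1) return true;
      Scale &= ~1;                       // fold away the "+ 1" form
      return Scale == 2 || Scale == 4 || Scale == 8;
    }
    int main() {
      assert(t2ScaleOK(3) && t2ScaleOK(5) && t2ScaleOK(9));   // 2^n + 1 forms
      assert(!t2ScaleOK(6) && !t2ScaleOK(7) && !t2ScaleOK(16));
    }
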
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
const Type *Ty) const {
- MVT VT = getValueType(Ty, true);
+ EVT VT = getValueType(Ty, true);
if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
return false;
@@ -2879,7 +3512,7 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
case 0: // no scale reg, must be "r+i" or "r", or "i".
break;
case 1:
- if (Subtarget->isThumb())
+ if (Subtarget->isThumb1Only())
return false;
// FALL THROUGH.
default:
@@ -2890,22 +3523,22 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
if (!VT.isSimple())
return false;
+ if (Subtarget->isThumb2())
+ return isLegalT2ScaledAddressingMode(AM, VT);
+
int Scale = AM.Scale;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i1:
case MVT::i8:
case MVT::i32:
- case MVT::i64:
- // This assumes i64 is legalized to a pair of i32. If not (i.e.
- // ldrd / strd are used, then its address mode is same as i16.
- // r + r
if (Scale < 0) Scale = -Scale;
if (Scale == 1)
return true;
// r + r << imm
return isPowerOf2_32(Scale & ~1);
case MVT::i16:
+ case MVT::i64:
// r + r
if (((unsigned)AM.HasBaseReg + Scale) <= 2)
return true;
@@ -2917,15 +3550,15 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
// operations. This should be made more precise and revisited later.
// Allow r << imm, but the imm has to be a multiple of two.
- if (AM.Scale & 1) return false;
- return isPowerOf2_32(AM.Scale);
+ if (Scale & 1) return false;
+ return isPowerOf2_32(Scale);
}
break;
}
return true;
}
-static bool getARMIndexedAddressParts(SDNode *Ptr, MVT VT,
+static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
bool isSEXTLoad, SDValue &Base,
SDValue &Offset, bool &isInc,
SelectionDAG &DAG) {
@@ -2983,7 +3616,7 @@ static bool getARMIndexedAddressParts(SDNode *Ptr, MVT VT,
return false;
}
-static bool getT2IndexedAddressParts(SDNode *Ptr, MVT VT,
+static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
bool isSEXTLoad, SDValue &Base,
SDValue &Offset, bool &isInc,
SelectionDAG &DAG) {
@@ -3019,7 +3652,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
if (Subtarget->isThumb1Only())
return false;
- MVT VT;
+ EVT VT;
SDValue Ptr;
bool isSEXTLoad = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
@@ -3037,7 +3670,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
if (Subtarget->isThumb2())
isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
Offset, isInc, DAG);
- else
+ else
isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
Offset, isInc, DAG);
if (!isLegal)
@@ -3058,7 +3691,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
if (Subtarget->isThumb1Only())
return false;
- MVT VT;
+ EVT VT;
SDValue Ptr;
bool isSEXTLoad = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
@@ -3074,7 +3707,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
if (Subtarget->isThumb2())
isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
isInc, DAG);
- else
+ else
isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
isInc, DAG);
if (!isLegal)
@@ -3128,12 +3761,12 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
std::pair<unsigned, const TargetRegisterClass*>
ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const {
+ EVT VT) const {
if (Constraint.size() == 1) {
// GCC RS6000 Constraint Letters
switch (Constraint[0]) {
case 'l':
- if (Subtarget->isThumb())
+ if (Subtarget->isThumb1Only())
return std::make_pair(0U, ARM::tGPRRegisterClass);
else
return std::make_pair(0U, ARM::GPRRegisterClass);
@@ -3152,7 +3785,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
std::vector<unsigned> ARMTargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const {
+ EVT VT) const {
if (Constraint.size() != 1)
return std::vector<unsigned>();
@@ -3214,10 +3847,16 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
switch (Constraint) {
case 'I':
- if (Subtarget->isThumb()) {
- // This must be a constant between 0 and 255, for ADD immediates.
+ if (Subtarget->isThumb1Only()) {
+ // This must be a constant between 0 and 255, for ADD
+ // immediates.
if (CVal >= 0 && CVal <= 255)
break;
+ } else if (Subtarget->isThumb2()) {
+ // A constant that can be used as an immediate value in a
+ // data-processing instruction.
+ if (ARM_AM::getT2SOImmVal(CVal) != -1)
+ break;
} else {
// A constant that can be used as an immediate value in a
// data-processing instruction.
@@ -3227,7 +3866,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'J':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb()) { // FIXME thumb2
// This must be a constant between -255 and -1, for negated ADD
// immediates. This can be used in GCC with an "n" modifier that
// prints the negated value, for use with SUB instructions. It is
@@ -3244,13 +3883,21 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'K':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb1Only()) {
// A 32-bit value where only one byte has a nonzero value. Exclude
// zero to match GCC. This constraint is used by GCC internally for
// constants that can be loaded with a move/shift combination.
// It is not useful otherwise but is implemented for compatibility.
if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
break;
+ } else if (Subtarget->isThumb2()) {
+ // A constant whose bitwise inverse can be used as an immediate
+ // value in a data-processing instruction. This can be used in GCC
+ // with a "B" modifier that prints the inverted value, for use with
+ // BIC and MVN instructions. It is not useful otherwise but is
+ // implemented for compatibility.
+ if (ARM_AM::getT2SOImmVal(~CVal) != -1)
+ break;
} else {
// A constant whose bitwise inverse can be used as an immediate
// value in a data-processing instruction. This can be used in GCC
@@ -3263,11 +3910,19 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'L':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb1Only()) {
// This must be a constant between -7 and 7,
// for 3-operand ADD/SUB immediate instructions.
if (CVal >= -7 && CVal < 7)
break;
+ } else if (Subtarget->isThumb2()) {
+ // A constant whose negation can be used as an immediate value in a
+ // data-processing instruction. This can be used in GCC with an "n"
+ // modifier that prints the negated value, for use with SUB
+ // instructions. It is not useful otherwise but is implemented for
+ // compatibility.
+ if (ARM_AM::getT2SOImmVal(-CVal) != -1)
+ break;
} else {
// A constant whose negation can be used as an immediate value in a
// data-processing instruction. This can be used in GCC with an "n"
@@ -3280,7 +3935,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'M':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb()) { // FIXME thumb2
// This must be a multiple of 4 between 0 and 1020, for
// ADD sp + immediate.
if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
@@ -3295,7 +3950,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'N':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb()) { // FIXME thumb2
// This must be a constant between 0 and 31, for shift amounts.
if (CVal >= 0 && CVal <= 31)
break;
@@ -3303,7 +3958,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'O':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb()) { // FIXME thumb2
// This must be a multiple of 4 between -508 and 508, for
// ADD/SUB sp = sp + immediate.
if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
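
The new Thumb2 branches above all lean on ARM_AM::getT2SOImmVal. For orientation, a partial re-implementation of the splat forms it accepts; the real helper also handles rotated 8-bit values, so this sketch is deliberately incomplete:

    #include <cstdint>
    static bool isT2SOImmSplat(uint32_t V) {   // illustrative only
      uint32_t Lo = V & 0xff, Hi = (V >> 8) & 0xff;
      return V < 256 ||                        // 0x000000XY
             V == (Lo | (Lo << 16)) ||         // 0x00XY00XY
             V == ((Hi << 8) | (Hi << 24)) ||  // 0xXY00XY00
             V == (Lo * 0x01010101u);          // 0xXYXYXYXY
    }
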
@@ -3322,3 +3977,9 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
Ops, DAG);
}
+
+bool
+ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The ARM target isn't yet aware of offsets.
+ return false;
+}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 553a86d..7d85f45 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -40,6 +40,7 @@ namespace llvm {
tCALL, // Thumb function call.
BRCOND, // Conditional branch.
BR_JT, // Jumptable branch.
+ BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump).
RET_FLAG, // Return with a flag operand.
PIC_ADD, // Add with a PC operand and a PIC label.
@@ -64,11 +65,13 @@ namespace llvm {
FMRRD, // double to two gprs.
FMDRR, // Two gprs to double.
- EH_SJLJ_SETJMP, // SjLj exception handling setjmp
- EH_SJLJ_LONGJMP, // SjLj exception handling longjmp
+ EH_SJLJ_SETJMP, // SjLj exception handling setjmp.
+ EH_SJLJ_LONGJMP, // SjLj exception handling longjmp.
THREAD_POINTER,
+ DYN_ALLOC, // Dynamic allocation on the stack.
+
VCEQ, // Vector compare equal.
VCGE, // Vector compare greater than or equal.
VCGEU, // Vector compare unsigned greater than or equal.
@@ -112,8 +115,18 @@ namespace llvm {
VGETLANEu, // zero-extend vector extract element
VGETLANEs, // sign-extend vector extract element
- // Vector duplicate lane (128-bit result only; 64-bit is a shuffle)
- VDUPLANEQ // splat a lane from a 64-bit vector to a 128-bit vector
+ // Vector duplicate:
+ VDUP,
+ VDUPLANE,
+
+ // Vector shuffles:
+ VEXT, // extract
+ VREV64, // reverse elements within 64-bit doublewords
+ VREV32, // reverse elements within 32-bit words
+ VREV16, // reverse elements within 16-bit halfwords
+ VZIP, // zip (interleave)
+ VUZP, // unzip (deinterleave)
+ VTRN // transpose
};
}
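
For orientation, the semantics of one of the new shuffle nodes; this standalone sketch (not tree code) shows what VREV32 computes on an 8 x i8 vector:

    #include <algorithm>
    #include <cstdint>
    // VREV32: reverse the elements inside each 32-bit word of the vector.
    static void vrev32_v8i8(uint8_t V[8]) {
      std::reverse(V, V + 4);      // lanes 0..3 -> 3..0
      std::reverse(V + 4, V + 8);  // lanes 4..7 -> 7..4
    }
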
@@ -147,11 +160,18 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *MBB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*>*) const;
+
+ /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+ /// unaligned memory accesses of the specified type.
+ /// FIXME: Add getOptimalMemOpType to implement memcpy with NEON?
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT) const;
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+ bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
@@ -175,13 +195,15 @@ namespace llvm {
APInt &KnownOne,
const SelectionDAG &DAG,
unsigned Depth) const;
+
+
ConstraintType getConstraintType(const std::string &Constraint) const;
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
std::vector<unsigned>
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops. If hasMemory is
@@ -200,21 +222,23 @@ namespace llvm {
/// getFunctionAlignment - Return the Log2 alignment of this function.
virtual unsigned getFunctionAlignment(const Function *F) const;
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const;
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
private:
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
/// make the right decision when generating code for different targets.
const ARMSubtarget *Subtarget;
- /// ARMPCLabelIndex - Keep track the number of ARM PC labels created.
+ /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
///
unsigned ARMPCLabelIndex;
- void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
- void addDRTypeForNEON(MVT VT);
- void addQRTypeForNEON(MVT VT);
+ void addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT);
+ void addDRTypeForNEON(EVT VT);
+ void addQRTypeForNEON(EVT VT);
typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
- void PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
+ void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
@@ -224,15 +248,13 @@ namespace llvm {
SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG, DebugLoc dl);
- CCAssignFn *CCAssignFnForNode(unsigned CC, bool Return) const;
- SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
- const SDValue &StackPtr, const CCValAssign &VA,
- SDValue Chain, SDValue Arg, ISD::ArgFlagsTy Flags);
- SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
- unsigned CallingConv, SelectionDAG &DAG);
- SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
+ CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const;
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags);
+ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG);
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG);
- SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
@@ -241,9 +263,9 @@ namespace llvm {
SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
SelectionDAG &DAG);
SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG);
- SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG);
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
SDValue Chain,
@@ -252,6 +274,33 @@ namespace llvm {
bool AlwaysInline,
const Value *DstSV, uint64_t DstSVOff,
const Value *SrcSV, uint64_t SrcSVOff);
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
};
}
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 301a6c1..3d19f23 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -54,9 +54,15 @@ def NEONGetLnFrm : Format<25>;
def NEONSetLnFrm : Format<26>;
def NEONDupFrm : Format<27>;
-// Misc flag for data processing instructions that indicates whether
+// Misc flags.
+
-// the instruction has a Rn register operand.
-class UnaryDP { bit isUnaryDataProc = 1; }
+// UnaryDP - Indicates this is a unary data processing instruction, i.e.
+// it doesn't have a Rn operand.
+class UnaryDP { bit isUnaryDataProc = 1; }
+
+// Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+// a 16-bit Thumb instruction if certain conditions are met.
+class Xform16Bit { bit canXformTo16Bit = 1; }
//===----------------------------------------------------------------------===//
// ARM Instruction flags. These need to match ARMInstrInfo.h.
@@ -77,7 +84,7 @@ def AddrModeT1_1 : AddrMode<7>;
def AddrModeT1_2 : AddrMode<8>;
def AddrModeT1_4 : AddrMode<9>;
def AddrModeT1_s : AddrMode<10>;
-def AddrModeT2_i12: AddrMode<12>;
+def AddrModeT2_i12: AddrMode<11>;
def AddrModeT2_i8 : AddrMode<12>;
def AddrModeT2_so : AddrMode<13>;
def AddrModeT2_pc : AddrMode<14>;
@@ -103,11 +110,33 @@ def IndexModePost : IndexMode<2>;
//===----------------------------------------------------------------------===//
+// ARM special operands.
+//
+
+// ARM Predicate operand. Default to 14 = always (AL). Second part is CC
+// register whose default is 0 (no register).
+def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
+ (ops (i32 14), (i32 zero_reg))> {
+ let PrintMethod = "printPredicateOperand";
+}
+
+// Conditional code result for instructions whose 's' bit is set, e.g. subs.
+def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
+ let PrintMethod = "printSBitModifierOperand";
+}
+
+// Same as cc_out except it defaults to setting CPSR.
+def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
+ let PrintMethod = "printSBitModifierOperand";
+}
+
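
The "(i32 14)" default above is the AL (always) condition. For reference, a sketch of the standard ARM condition-code encodings it indexes into (values per the ARM architecture manual, not from this patch):

    // ARM condition codes: a 4-bit field where 0b1110 (14) means "always".
    enum ARMCondCode {
      EQ = 0, NE = 1, CS = 2, CC = 3, MI = 4,  PL = 5,  VS = 6,  VC = 7,
      HI = 8, LS = 9, GE = 10, LT = 11, GT = 12, LE = 13, AL = 14
    };
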
+//===----------------------------------------------------------------------===//
+
// ARM Instruction templates.
//
class InstARM<AddrMode am, SizeFlagVal sz, IndexMode im,
- Format f, string cstr>
+ Format f, string cstr, InstrItinClass itin>
: Instruction {
field bits<32> Inst;
@@ -130,12 +159,15 @@ class InstARM<AddrMode am, SizeFlagVal sz, IndexMode im,
// Attributes specific to ARM instructions...
//
bit isUnaryDataProc = 0;
+ bit canXformTo16Bit = 0;
let Constraints = cstr;
+ let Itinerary = itin;
}
-class PseudoInst<dag oops, dag iops, string asm, list<dag> pattern>
- : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, ""> {
+class PseudoInst<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, "", itin> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
@@ -144,9 +176,10 @@ class PseudoInst<dag oops, dag iops, string asm, list<dag> pattern>
// Almost all ARM instructions are predicable.
class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
- IndexMode im, Format f, string opc, string asm, string cstr,
+ IndexMode im, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr,
list<dag> pattern>
- : InstARM<am, sz, im, f, cstr> {
+ : InstARM<am, sz, im, f, cstr, itin> {
let OutOperandList = oops;
let InOperandList = !con(iops, (ops pred:$p));
let AsmString = !strconcat(opc, !strconcat("${p}", asm));
@@ -158,9 +191,10 @@ class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
// an input operand since by default it's a zero register. It will
// become an implicit def once it's "flipped".
class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
- IndexMode im, Format f, string opc, string asm, string cstr,
+ IndexMode im, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr,
list<dag> pattern>
- : InstARM<am, sz, im, f, cstr> {
+ : InstARM<am, sz, im, f, cstr, itin> {
let OutOperandList = oops;
let InOperandList = !con(iops, (ops pred:$p, cc_out:$s));
let AsmString = !strconcat(opc, !strconcat("${p}${s}", asm));
@@ -170,8 +204,9 @@ class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
// Special cases
class XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
- IndexMode im, Format f, string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, im, f, cstr> {
+ IndexMode im, Format f, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, im, f, cstr, itin> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
@@ -179,90 +214,93 @@ class XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
list<Predicate> Predicates = [IsARM];
}
-class AI<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern>;
-class AsI<dag oops, dag iops, Format f, string opc,
+class AI<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+class AsI<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+class AXI<dag oops, dag iops, Format f, InstrItinClass itin,
string asm, list<dag> pattern>
- : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, opc,
+ : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern>;
-class AXI<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, asm,
- "", pattern>;
// Ctrl flow instructions
-class ABI<bits<4> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, opc,
- asm, "", pattern> {
+class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, itin,
+ opc, asm, "", pattern> {
let Inst{27-24} = opcod;
}
-class ABXI<bits<4> opcod, dag oops, dag iops, string asm, list<dag> pattern>
- : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, asm,
- "", pattern> {
+class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, itin,
+ asm, "", pattern> {
let Inst{27-24} = opcod;
}
-class ABXIx2<dag oops, dag iops, string asm, list<dag> pattern>
- : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, BrMiscFrm, asm,
- "", pattern>;
+class ABXIx2<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, BrMiscFrm, itin,
+ asm, "", pattern>;
// BR_JT instructions
-class JTI<dag oops, dag iops, string asm, list<dag> pattern>
- : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm,
+class JTI<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm, itin,
asm, "", pattern>;
// addrmode1 instructions
-class AI1<bits<4> opcod, dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{24-21} = opcod;
let Inst{27-26} = {0,0};
}
-class AsI1<bits<4> opcod, dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : sI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : sI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{24-21} = opcod;
let Inst{27-26} = {0,0};
}
-class AXI1<bits<4> opcod, dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, asm,
- "", pattern> {
+class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin,
+ asm, "", pattern> {
let Inst{24-21} = opcod;
let Inst{27-26} = {0,0};
}
-class AI1x2<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, opc,
- asm, "", pattern>;
+class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
// addrmode2 loads and stores
-class AI2<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI2<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{27-26} = {0,1};
}
// loads
-class AI2ldw<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AXI2ldw<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
@@ -270,19 +308,19 @@ class AXI2ldw<dag oops, dag iops, Format f, string asm,
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AI2ldb<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 1; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AXI2ldb<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
@@ -292,19 +330,19 @@ class AXI2ldb<dag oops, dag iops, Format f, string asm,
}
// stores
-class AI2stw<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI2stw<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AXI2stw<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+class AXI2stw<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
@@ -312,19 +350,19 @@ class AXI2stw<dag oops, dag iops, Format f, string asm,
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AI2stb<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI2stb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 1; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AXI2stb<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+class AXI2stb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
@@ -334,20 +372,20 @@ class AXI2stb<dag oops, dag iops, Format f, string asm,
}
// Pre-indexed loads
-class AI2ldwpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI2ldwpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AI2ldbpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI2ldbpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{22} = 1; // B bit
@@ -356,20 +394,20 @@ class AI2ldbpr<dag oops, dag iops, Format f, string opc,
}
// Pre-indexed stores
-class AI2stwpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI2stwpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 1; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AI2stbpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI2stbpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 1; // W bit
let Inst{22} = 1; // B bit
@@ -378,20 +416,20 @@ class AI2stbpr<dag oops, dag iops, Format f, string opc,
}
// Post-indexed loads
-class AI2ldwpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI2ldwpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 0; // P bit
let Inst{27-26} = {0,1};
}
-class AI2ldbpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI2ldbpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 1; // B bit
@@ -400,20 +438,20 @@ class AI2ldbpo<dag oops, dag iops, Format f, string opc,
}
// Post-indexed stores
-class AI2stwpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI2stwpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 0; // P bit
let Inst{27-26} = {0,1};
}
-class AI2stbpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 1; // B bit
@@ -422,20 +460,20 @@ class AI2stbpo<dag oops, dag iops, Format f, string opc,
}
// addrmode3 instructions
-class AI3<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern>;
-class AXI3<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, asm,
- "", pattern>;
+class AI3<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+class AXI3<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ asm, "", pattern>;
// loads
-class AI3ldh<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3ldh<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -443,10 +481,11 @@ class AI3ldh<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AXI3ldh<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+class AXI3ldh<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
@@ -456,10 +495,10 @@ class AXI3ldh<dag oops, dag iops, Format f, string asm,
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
}
-class AI3ldsh<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 1; // S bit
@@ -467,10 +506,11 @@ class AI3ldsh<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AXI3ldsh<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+class AXI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
@@ -480,10 +520,10 @@ class AXI3ldsh<dag oops, dag iops, Format f, string asm,
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
}
-class AI3ldsb<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 0; // H bit
let Inst{6} = 1; // S bit
@@ -491,10 +531,11 @@ class AI3ldsb<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AXI3ldsb<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+class AXI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 0; // H bit
@@ -504,10 +545,10 @@ class AXI3ldsb<dag oops, dag iops, Format f, string asm,
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
}
-class AI3ldd<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3ldd<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 0; // H bit
let Inst{6} = 1; // S bit
@@ -515,13 +556,14 @@ class AI3ldd<dag oops, dag iops, Format f, string opc,
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
// stores
-class AI3sth<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3sth<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -529,10 +571,11 @@ class AI3sth<dag oops, dag iops, Format f, string opc,
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AXI3sth<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+class AXI3sth<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
@@ -542,10 +585,10 @@ class AXI3sth<dag oops, dag iops, Format f, string asm,
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
}
-class AI3std<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3std<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 1; // S bit
@@ -553,13 +596,14 @@ class AI3std<dag oops, dag iops, Format f, string opc,
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
// Pre-indexed loads
-class AI3ldhpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI3ldhpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -567,11 +611,12 @@ class AI3ldhpr<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AI3ldshpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI3ldshpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 1; // S bit
@@ -579,11 +624,12 @@ class AI3ldshpr<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AI3ldsbpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI3ldsbpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 0; // H bit
let Inst{6} = 1; // S bit
@@ -591,13 +637,14 @@ class AI3ldsbpr<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
// Pre-indexed stores
-class AI3sthpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -605,13 +652,14 @@ class AI3sthpr<dag oops, dag iops, Format f, string opc,
let Inst{20} = 0; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
// Post-indexed loads
-class AI3ldhpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI3ldhpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -619,11 +667,12 @@ class AI3ldhpo<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 0; // P bit
+ let Inst{27-25} = 0b000;
}
-class AI3ldshpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI3ldshpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 1; // S bit
@@ -631,11 +680,12 @@ class AI3ldshpo<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 0; // P bit
+ let Inst{27-25} = 0b000;
}
-class AI3ldsbpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI3ldsbpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 0; // H bit
let Inst{6} = 1; // S bit
@@ -643,13 +693,14 @@ class AI3ldsbpo<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 0; // P bit
+ let Inst{27-25} = 0b000;
}
// Post-indexed stores
-class AI3sthpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -657,57 +708,60 @@ class AI3sthpo<dag oops, dag iops, Format f, string opc,
let Inst{20} = 0; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 0; // P bit
+ let Inst{27-25} = 0b000;
}
// addrmode4 instructions
-class AXI4ld<dag oops, dag iops, Format f, string asm, list<dag> pattern>
- : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, asm,
- "", pattern> {
+class AXI4ld<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, itin,
+ asm, "", pattern> {
let Inst{20} = 1; // L bit
let Inst{22} = 0; // S bit
let Inst{27-25} = 0b100;
}
-class AXI4st<dag oops, dag iops, Format f, string asm, list<dag> pattern>
- : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, asm,
- "", pattern> {
+class AXI4st<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, itin,
+ asm, "", pattern> {
let Inst{20} = 0; // L bit
let Inst{22} = 0; // S bit
let Inst{27-25} = 0b100;
}
// Unsigned multiply, multiply-accumulate instructions.
-class AMul1I<bits<7> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
- asm, "", pattern> {
+class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+ opc, asm, "", pattern> {
let Inst{7-4} = 0b1001;
let Inst{20} = 0; // S bit
let Inst{27-21} = opcod;
}
-class AsMul1I<bits<7> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
- asm, "", pattern> {
+class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+ opc, asm, "", pattern> {
let Inst{7-4} = 0b1001;
let Inst{27-21} = opcod;
}
// Most significant word multiply
-class AMul2I<bits<7> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
- asm, "", pattern> {
+class AMul2I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+ opc, asm, "", pattern> {
let Inst{7-4} = 0b1001;
let Inst{20} = 1;
let Inst{27-21} = opcod;
}
// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y>
-class AMulxyI<bits<7> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
- asm, "", pattern> {
+class AMulxyI<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 0;
let Inst{7} = 1;
let Inst{20} = 0;
@@ -715,19 +769,19 @@ class AMulxyI<bits<7> opcod, dag oops, dag iops, string opc,
}
// Extend instructions.
-class AExtI<bits<8> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, opc,
- asm, "", pattern> {
+class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, itin,
+ opc, asm, "", pattern> {
let Inst{7-4} = 0b0111;
let Inst{27-20} = opcod;
}
// Misc Arithmetic instructions.
-class AMiscA1I<bits<8> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, opc,
- asm, "", pattern> {
+class AMiscA1I<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, itin,
+ opc, asm, "", pattern> {
let Inst{27-20} = opcod;
}
@@ -751,74 +805,120 @@ class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
// TI - Thumb instruction.
-class ThumbI<dag outs, dag ins, AddrMode am, SizeFlagVal sz,
- string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
- let OutOperandList = outs;
- let InOperandList = ins;
+class ThumbI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
let AsmString = asm;
let Pattern = pattern;
list<Predicate> Predicates = [IsThumb];
}
-class TI<dag outs, dag ins, string asm, list<dag> pattern>
- : ThumbI<outs, ins, AddrModeNone, Size2Bytes, asm, "", pattern>;
+class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>;
-// BL, BLX(1) are translated by assembler into two instructions
-class TIx2<dag outs, dag ins, string asm, list<dag> pattern>
- : ThumbI<outs, ins, AddrModeNone, Size4Bytes, asm, "", pattern>;
-
-// BR_JT instructions
-class TJTI<dag outs, dag ins, string asm, list<dag> pattern>
- : ThumbI<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>;
+// Two-address instructions
+class TIt<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", pattern>;
-// TPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
-class TPat<dag pattern, dag result> : Pat<pattern, result> {
- list<Predicate> Predicates = [IsThumb];
-}
+// tBL, tBX instructions
+class TIx2<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>;
-class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> {
- list<Predicate> Predicates = [IsThumb, HasV5T];
-}
+// BR_JT instructions
+class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : ThumbI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
// Thumb1 only
-class Thumb1I<dag outs, dag ins, AddrMode am, SizeFlagVal sz,
- string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
- let OutOperandList = outs;
- let InOperandList = ins;
+class Thumb1I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
let AsmString = asm;
let Pattern = pattern;
list<Predicate> Predicates = [IsThumb1Only];
}
-class T1I<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeNone, Size2Bytes, asm, "", pattern>;
-class T1I1<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeT1_1, Size2Bytes, asm, "", pattern>;
-class T1I2<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeT1_2, Size2Bytes, asm, "", pattern>;
-class T1I4<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeT1_4, Size2Bytes, asm, "", pattern>;
-class T1Is<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeT1_s, Size2Bytes, asm, "", pattern>;
-class T1Ix2<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeNone, Size4Bytes, asm, "", pattern>;
-class T1JTI<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>;
+class T1I<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>;
+class T1Ix2<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb1I<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>;
+class T1JTI<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb1I<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
// Two-address instructions
-class T1It<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeNone, Size2Bytes, asm, "$lhs = $dst", pattern>;
+class T1It<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin,
+ asm, "$lhs = $dst", pattern>;
-class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
+// Thumb1 instruction that can either be predicated or set CPSR.
+class Thumb1sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
+ let OutOperandList = !con(oops, (ops s_cc_out:$s));
+ let InOperandList = !con(iops, (ops pred:$p));
+ let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm));
+ let Pattern = pattern;
+ list<Predicate> Predicates = [IsThumb1Only];
+}
+
+class T1sI<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1sIt<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm,
+ "$lhs = $dst", pattern>;
+
+// Thumb1 instruction that can be predicated.
+class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ops pred:$p));
+ let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+ let Pattern = pattern;
list<Predicate> Predicates = [IsThumb1Only];
}
+class T1pI<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1pIt<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm,
+ "$lhs = $dst", pattern>;
+
+class T1pI1<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeT1_1, Size2Bytes, itin, opc, asm, "", pattern>;
+class T1pI2<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeT1_2, Size2Bytes, itin, opc, asm, "", pattern>;
+class T1pI4<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeT1_4, Size2Bytes, itin, opc, asm, "", pattern>;
+class T1pIs<dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeT1_s, Size2Bytes, itin, opc, asm, "", pattern>;
+
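Likewise, a predicable Thumb1 load built on T1pIs might look like the sketch
below; t_addrmode_s4 and IIC_iLoadi are assumed operand and itinerary names,
not definitions introduced in this hunk:

// Sketch only: a hypothetical predicable Thumb1 word load.
def tLDRexample : T1pIs<(outs tGPR:$dst), (ins t_addrmode_s4:$addr),
                        IIC_iLoadi, "ldr", " $dst, $addr",
                        [(set tGPR:$dst, (load t_addrmode_s4:$addr))]>;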
// Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable.
class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin,
string opc, string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
let OutOperandList = oops;
let InOperandList = !con(iops, (ops pred:$p));
let AsmString = !strconcat(opc, !strconcat("${p}", asm));
@@ -832,8 +932,9 @@ class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
// FIXME: This uses unified syntax so {s} comes before {p}. We should make it
// more consistent.
class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin,
string opc, string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
let OutOperandList = oops;
let InOperandList = !con(iops, (ops pred:$p, cc_out:$s));
let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm));
@@ -843,8 +944,9 @@ class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
// Special cases
class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin,
string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
@@ -852,31 +954,46 @@ class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
list<Predicate> Predicates = [IsThumb2];
}
-class T2I<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, opc, asm, "", pattern>;
-class T2Ii12<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, opc, asm, "", pattern>;
-class T2Ii8<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, opc, asm, "", pattern>;
-class T2Iso<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_so, Size4Bytes, opc, asm, "", pattern>;
-class T2Ipc<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_pc, Size4Bytes, opc, asm, "", pattern>;
-class T2Ii8s4<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_i8s4, Size4Bytes, opc, asm, "", pattern>;
+class T2I<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ii12<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ii8<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Iso<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_so, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ipc<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_pc, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ii8s4<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_i8s4, Size4Bytes, itin, opc, asm, "", pattern>;
+
+class T2sI<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2sI<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>;
+
+class T2XI<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb2XI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>;
+class T2JTI<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
-class T2sI<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2sI<oops, iops, AddrModeNone, Size4Bytes, opc, asm, "", pattern>;
+class T2Ix2<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeNone, Size8Bytes, itin, opc, asm, "", pattern>;
-class T2XI<dag oops, dag iops, string asm, list<dag> pattern>
- : Thumb2XI<oops, iops, AddrModeNone, Size4Bytes, asm, "", pattern>;
-class T2JTI<dag oops, dag iops, string asm, list<dag> pattern>
- : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, asm, "", pattern>;
// T2Iidxldst - Thumb2 indexed load / store instructions.
class T2Iidxldst<dag oops, dag iops, AddrMode am, IndexMode im,
+ InstrItinClass itin,
string opc, string asm, string cstr, list<dag> pattern>
- : InstARM<am, Size4Bytes, im, ThumbFrm, cstr> {
+ : InstARM<am, Size4Bytes, im, ThumbFrm, cstr, itin> {
let OutOperandList = oops;
let InOperandList = !con(iops, (ops pred:$p));
let AsmString = !strconcat(opc, !strconcat("${p}", asm));
@@ -884,6 +1001,15 @@ class T2Iidxldst<dag oops, dag iops, AddrMode am, IndexMode im,
list<Predicate> Predicates = [IsThumb2];
}
+// Tv5Pat - Same as Pat<>, but requires Thumb1 mode with V5T support.
+class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsThumb1Only, HasV5T];
+}
+
+// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode.
+class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsThumb1Only];
+}
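These Pat subclasses are consumed exactly like plain Pat<> records, with the
mode predicates attached implicitly. An illustrative use, with the instruction
name assumed from ARMInstrThumb.td:

// Sketch only: a selection pattern that is valid solely in Thumb1 mode.
def : T1Pat<(ARMWrapper tglobaladdr:$addr), (tLEApcrel tglobaladdr:$addr)>;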
// T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode.
class T2Pat<dag pattern, dag result> : Pat<pattern, result> {
@@ -896,11 +1022,41 @@ class T2Pat<dag pattern, dag result> : Pat<pattern, result> {
// ARM VFP Instruction templates.
//
+// Almost all VFP instructions are predicable.
+class VFPI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ IndexMode im, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, im, f, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ops pred:$p));
+ let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+ let Pattern = pattern;
+ list<Predicate> Predicates = [HasVFP2];
+}
+
+// Special cases
+class VFPXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ IndexMode im, Format f, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, im, f, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
+ let AsmString = asm;
+ let Pattern = pattern;
+ list<Predicate> Predicates = [HasVFP2];
+}
+
+class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+
// ARM VFP addrmode5 loads and stores
class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+ InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : I<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
- VFPLdStFrm, opc, asm, "", pattern> {
+ : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStFrm, itin, opc, asm, "", pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-24} = opcod1;
let Inst{21-20} = opcod2;
@@ -908,9 +1064,10 @@ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
}
class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+ InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : I<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
- VFPLdStFrm, opc, asm, "", pattern> {
+ : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStFrm, itin, opc, asm, "", pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-24} = opcod1;
let Inst{21-20} = opcod2;
@@ -918,27 +1075,28 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
}
// Load / store multiple
-class AXSI5<dag oops, dag iops, string asm, list<dag> pattern>
- : XI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
- VFPLdStMulFrm, asm, "", pattern> {
+class AXDI5<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStMulFrm, itin, asm, "", pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-25} = 0b110;
let Inst{11-8} = 0b1011;
}
-class AXDI5<dag oops, dag iops, string asm, list<dag> pattern>
- : XI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
- VFPLdStMulFrm, asm, "", pattern> {
+class AXSI5<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStMulFrm, itin, asm, "", pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-25} = 0b110;
let Inst{11-8} = 0b1010;
}
-
// Double precision, unary
class ADuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
- string opc, string asm, list<dag> pattern>
- : AI<oops, iops, VFPUnaryFrm, opc, asm, pattern> {
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{19-16} = opcod2;
let Inst{11-8} = 0b1011;
@@ -946,17 +1104,17 @@ class ADuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
}
// Double precision, binary
-class ADbI<bits<8> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AI<oops, iops, VFPBinaryFrm, opc, asm, pattern> {
+class ADbI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
let Inst{27-20} = opcod;
let Inst{11-8} = 0b1011;
}
// Single precision, unary
class ASuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
- string opc, string asm, list<dag> pattern>
- : AI<oops, iops, VFPUnaryFrm, opc, asm, pattern> {
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
// Bits 22 (D bit) and 5 (M bit) will be changed during instruction encoding.
let Inst{27-20} = opcod1;
let Inst{19-16} = opcod2;
@@ -964,48 +1122,74 @@ class ASuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
let Inst{7-4} = opcod3;
}
+// Single precision unary, if no NEON
+// Same as ASuI except not available if NEON is enabled
+class ASuIn<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : ASuI<opcod1, opcod2, opcod3, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2, DontUseNEONForFP];
+}
+
// Single precision, binary
-class ASbI<bits<8> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AI<oops, iops, VFPBinaryFrm, opc, asm, pattern> {
+class ASbI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
// Bit 22 (D bit) can be changed during instruction encoding.
let Inst{27-20} = opcod;
let Inst{11-8} = 0b1010;
}
+// Single precision binary, if no NEON
+// Same as ASbI except not available if NEON is enabled
+class ASbIn<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : ASbI<opcod, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2, DontUseNEONForFP];
+}
+
// VFP conversion instructions
class AVConv1I<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3,
- dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : AI<oops, iops, VFPConv1Frm, opc, asm, pattern> {
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{19-16} = opcod2;
let Inst{11-8} = opcod3;
let Inst{6} = 1;
}
+// VFP conversion instructions, if no NEON
+class AVConv1In<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2, DontUseNEONForFP];
+}
+
class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f,
- string opc, string asm, list<dag> pattern>
- : AI<oops, iops, f, opc, asm, pattern> {
+ InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, f, itin, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{11-8} = opcod2;
let Inst{4} = 1;
}
-class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, opc, asm, pattern>;
+class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>;
-class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, opc, asm, pattern>;
+class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>;
-class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, opc, asm, pattern>;
+class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>;
-class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, opc, asm, pattern>;
+class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>;
//===----------------------------------------------------------------------===//
@@ -1013,9 +1197,9 @@ class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
// ARM NEON Instruction templates.
//
-class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm,
- string cstr, list<dag> pattern>
- : InstARM<am, Size4Bytes, im, NEONFrm, cstr> {
+class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : InstARM<am, Size4Bytes, im, NEONFrm, cstr, itin> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
@@ -1023,20 +1207,33 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm,
list<Predicate> Predicates = [HasNEON];
}
-class NI<dag oops, dag iops, string asm, list<dag> pattern>
- : NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, "", pattern> {
+class NI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : NeonI<oops, iops, AddrModeNone, IndexModeNone, itin, asm, "", pattern> {
+}
+
+class NI4<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : NeonI<oops, iops, AddrMode4, IndexModeNone, itin, asm, "", pattern> {
+}
+
+class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+ dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NeonI<oops, iops, AddrMode6, IndexModeNone, itin, asm, cstr, pattern> {
+ let Inst{31-24} = 0b11110100;
}
-class NDataI<dag oops, dag iops, string asm, string cstr, list<dag> pattern>
- : NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, cstr, pattern> {
+class NDataI<dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NeonI<oops, iops, AddrModeNone, IndexModeNone, itin, asm, cstr, pattern> {
let Inst{31-25} = 0b1111001;
}
// NEON "one register and a modified immediate" format.
class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
bit op5, bit op4,
- dag oops, dag iops, string asm, string cstr, list<dag> pattern>
- : NDataI<oops, iops, asm, cstr, pattern> {
+ dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, itin, asm, cstr, pattern> {
let Inst{23} = op23;
let Inst{21-19} = op21_19;
let Inst{11-8} = op11_8;
@@ -1049,8 +1246,9 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
// NEON 2 vector register format.
class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
bits<5> op11_7, bit op6, bit op4,
- dag oops, dag iops, string asm, string cstr, list<dag> pattern>
- : NDataI<oops, iops, asm, cstr, pattern> {
+ dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, itin, asm, cstr, pattern> {
let Inst{24-23} = op24_23;
let Inst{21-20} = op21_20;
let Inst{19-18} = op19_18;
@@ -1063,8 +1261,9 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
// NEON 2 vector register with immediate.
class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op6, bit op4,
- dag oops, dag iops, string asm, string cstr, list<dag> pattern>
- : NDataI<oops, iops, asm, cstr, pattern> {
+ dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, itin, asm, cstr, pattern> {
let Inst{24} = op24;
let Inst{23} = op23;
let Inst{21-16} = op21_16;
@@ -1076,8 +1275,9 @@ class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
// NEON 3 vector register format.
class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
- dag oops, dag iops, string asm, string cstr, list<dag> pattern>
- : NDataI<oops, iops, asm, cstr, pattern> {
+ dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, itin, asm, cstr, pattern> {
let Inst{24} = op24;
let Inst{23} = op23;
let Inst{21-20} = op21_20;
@@ -1088,9 +1288,9 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
// NEON VMOVs between scalar and core registers.
class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
- dag oops, dag iops, Format f, string opc, string asm,
- list<dag> pattern>
- : AI<oops, iops, f, opc, asm, pattern> {
+ dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AI<oops, iops, f, itin, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{11-8} = opcod2;
let Inst{6-5} = opcod3;
@@ -1098,13 +1298,23 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
list<Predicate> Predicates = [HasNEON];
}
class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
- dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, opc, asm,
- pattern>;
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, itin,
+ opc, asm, pattern>;
class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
- dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, opc, asm,
- pattern>;
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, itin,
+ opc, asm, pattern>;
class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
- dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, opc, asm, pattern>;
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, itin,
+ opc, asm, pattern>;
+
+// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON
+// for single-precision FP.
+class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [HasNEON, UseNEONForFP];
+}
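An illustrative NEONFPPat use; VADDfd_sfp is an assumed opcode name from
ARMInstrNEON.td, not something defined in this hunk:

// Sketch only: route single-precision fadd through NEON when
// UseNEONForFP is in effect.
def : NEONFPPat<(f32 (fadd SPR:$a, SPR:$b)), (VADDfd_sfp SPR:$a, SPR:$b)>;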
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 443fdc7..4c92891 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -21,52 +21,15 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
-static cl::opt<bool>
-EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
- cl::desc("Enable ARM 2-addr to 3-addr conv"));
-
-static inline
-const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
- return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
-}
-
-static inline
-const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
- return MIB.addReg(0);
-}
-
-ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI)
- : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)) {
-}
-
ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI(*this, STI) {
+ : RI(*this, STI), Subtarget(STI) {
}
-void ARMInstrInfo::reMaterialize(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg,
- const MachineInstr *Orig) const {
- DebugLoc dl = Orig->getDebugLoc();
- if (Orig->getOpcode() == ARM::MOVi2pieces) {
- RI.emitLoadConstPool(MBB, I, this, dl,
- DestReg,
- Orig->getOperand(1).getImm(),
- (ARMCC::CondCodes)Orig->getOperand(2).getImm(),
- Orig->getOperand(3).getReg());
- return;
- }
-
- MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
- MI->getOperand(0).setReg(DestReg);
- MBB.insert(I, MI);
-}
-
-static unsigned getUnindexedOpcode(unsigned Opc) {
+unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
switch (Opc) {
default: break;
case ARM::LDR_PRE:
@@ -94,820 +57,45 @@ static unsigned getUnindexedOpcode(unsigned Opc) {
case ARM::STRB_POST:
return ARM::STRB;
}
- return 0;
-}
-
-MachineInstr *
-ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const {
- if (!EnableARM3Addr)
- return NULL;
-
- MachineInstr *MI = MBBI;
- MachineFunction &MF = *MI->getParent()->getParent();
- unsigned TSFlags = MI->getDesc().TSFlags;
- bool isPre = false;
- switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
- default: return NULL;
- case ARMII::IndexModePre:
- isPre = true;
- break;
- case ARMII::IndexModePost:
- break;
- }
-
- // Try splitting an indexed load/store to an un-indexed one plus an add/sub
- // operation.
- unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
- if (MemOpc == 0)
- return NULL;
-
- MachineInstr *UpdateMI = NULL;
- MachineInstr *MemMI = NULL;
- unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
- const TargetInstrDesc &TID = MI->getDesc();
- unsigned NumOps = TID.getNumOperands();
- bool isLoad = !TID.mayStore();
- const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
- const MachineOperand &Base = MI->getOperand(2);
- const MachineOperand &Offset = MI->getOperand(NumOps-3);
- unsigned WBReg = WB.getReg();
- unsigned BaseReg = Base.getReg();
- unsigned OffReg = Offset.getReg();
- unsigned OffImm = MI->getOperand(NumOps-2).getImm();
- ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
- switch (AddrMode) {
- default:
- assert(false && "Unknown indexed op!");
- return NULL;
- case ARMII::AddrMode2: {
- bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
- unsigned Amt = ARM_AM::getAM2Offset(OffImm);
- if (OffReg == 0) {
- int SOImmVal = ARM_AM::getSOImmVal(Amt);
- if (SOImmVal == -1)
- // Can't encode it in a so_imm operand. This transformation will
- // add more than 1 instruction. Abandon!
- return NULL;
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
- .addReg(BaseReg).addImm(SOImmVal)
- .addImm(Pred).addReg(0).addReg(0);
- } else if (Amt != 0) {
- ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
- unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
- .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
- .addImm(Pred).addReg(0).addReg(0);
- } else
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
- .addReg(BaseReg).addReg(OffReg)
- .addImm(Pred).addReg(0).addReg(0);
- break;
- }
- case ARMII::AddrMode3 : {
- bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
- unsigned Amt = ARM_AM::getAM3Offset(OffImm);
- if (OffReg == 0)
- // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
- .addReg(BaseReg).addImm(Amt)
- .addImm(Pred).addReg(0).addReg(0);
- else
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
- .addReg(BaseReg).addReg(OffReg)
- .addImm(Pred).addReg(0).addReg(0);
- break;
- }
- }
-
- std::vector<MachineInstr*> NewMIs;
- if (isPre) {
- if (isLoad)
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc), MI->getOperand(0).getReg())
- .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
- else
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc)).addReg(MI->getOperand(1).getReg())
- .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
- NewMIs.push_back(MemMI);
- NewMIs.push_back(UpdateMI);
- } else {
- if (isLoad)
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc), MI->getOperand(0).getReg())
- .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
- else
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc)).addReg(MI->getOperand(1).getReg())
- .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
- if (WB.isDead())
- UpdateMI->getOperand(0).setIsDead();
- NewMIs.push_back(UpdateMI);
- NewMIs.push_back(MemMI);
- }
-
- // Transfer LiveVariables states, kill / dead info.
- if (LV) {
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
- if (MO.isReg() && MO.getReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
- unsigned Reg = MO.getReg();
-
- LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
- if (MO.isDef()) {
- MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
- if (MO.isDead())
- LV->addVirtualRegisterDead(Reg, NewMI);
- }
- if (MO.isUse() && MO.isKill()) {
- for (unsigned j = 0; j < 2; ++j) {
- // Look at the two new MI's in reverse order.
- MachineInstr *NewMI = NewMIs[j];
- if (!NewMI->readsRegister(Reg))
- continue;
- LV->addVirtualRegisterKilled(Reg, NewMI);
- if (VI.removeKill(MI))
- VI.Kills.push_back(NewMI);
- break;
- }
- }
- }
- }
- }
-
- MFI->insert(MBBI, NewMIs[1]);
- MFI->insert(MBBI, NewMIs[0]);
- return NewMIs[0];
-}
-
-// Branch analysis.
-bool
-ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
- // If the block has no terminators, it just falls into the block after it.
- MachineBasicBlock::iterator I = MBB.end();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
- return false;
-
- // Get the last instruction in the block.
- MachineInstr *LastInst = I;
-
- // If there is only one terminator instruction, process it.
- unsigned LastOpc = LastInst->getOpcode();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
- if (LastOpc == ARM::B || LastOpc == ARM::tB || LastOpc == ARM::t2B) {
- TBB = LastInst->getOperand(0).getMBB();
- return false;
- }
- if (LastOpc == ARM::Bcc || LastOpc == ARM::tBcc || LastOpc == ARM::t2Bcc) {
- // Block ends with fall-through condbranch.
- TBB = LastInst->getOperand(0).getMBB();
- Cond.push_back(LastInst->getOperand(1));
- Cond.push_back(LastInst->getOperand(2));
- return false;
- }
- return true; // Can't handle indirect branch.
- }
-
- // Get the instruction before it if it is a terminator.
- MachineInstr *SecondLastInst = I;
-
- // If there are three terminators, we don't know what sort of block this is.
- if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
- return true;
-
- // If the block ends with ARM::B/ARM::tB/ARM::t2B and a
- // ARM::Bcc/ARM::tBcc/ARM::t2Bcc, handle it.
- unsigned SecondLastOpc = SecondLastInst->getOpcode();
- if ((SecondLastOpc == ARM::Bcc && LastOpc == ARM::B) ||
- (SecondLastOpc == ARM::tBcc && LastOpc == ARM::tB) ||
- (SecondLastOpc == ARM::t2Bcc && LastOpc == ARM::t2B)) {
- TBB = SecondLastInst->getOperand(0).getMBB();
- Cond.push_back(SecondLastInst->getOperand(1));
- Cond.push_back(SecondLastInst->getOperand(2));
- FBB = LastInst->getOperand(0).getMBB();
- return false;
- }
-
- // If the block ends with two unconditional branches, handle it. The second
- // one is not executed, so remove it.
- if ((SecondLastOpc == ARM::B || SecondLastOpc==ARM::tB ||
- SecondLastOpc==ARM::t2B) &&
- (LastOpc == ARM::B || LastOpc == ARM::tB || LastOpc == ARM::t2B)) {
- TBB = SecondLastInst->getOperand(0).getMBB();
- I = LastInst;
- if (AllowModify)
- I->eraseFromParent();
- return false;
- }
-
- // ...likewise if it ends with a branch table followed by an unconditional
- // branch. The branch folder can create these, and we must get rid of them for
- // correctness of Thumb constant islands.
- if ((SecondLastOpc == ARM::BR_JTr || SecondLastOpc==ARM::BR_JTm ||
- SecondLastOpc == ARM::BR_JTadd || SecondLastOpc==ARM::tBR_JTr ||
- SecondLastOpc == ARM::t2BR_JTr || SecondLastOpc==ARM::t2BR_JTm ||
- SecondLastOpc == ARM::t2BR_JTadd) &&
- (LastOpc == ARM::B || LastOpc == ARM::tB || LastOpc == ARM::t2B)) {
- I = LastInst;
- if (AllowModify)
- I->eraseFromParent();
- return true;
- }
-
- // Otherwise, can't handle this.
- return true;
-}
-
-
-unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
- MachineFunction &MF = *MBB.getParent();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- int BOpc = AFI->isThumbFunction() ?
- (AFI->isThumb2Function() ? ARM::t2B : ARM::tB) : ARM::B;
- int BccOpc = AFI->isThumbFunction() ?
- (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
-
- MachineBasicBlock::iterator I = MBB.end();
- if (I == MBB.begin()) return 0;
- --I;
- if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc)
- return 0;
-
- // Remove the branch.
- I->eraseFromParent();
-
- I = MBB.end();
-
- if (I == MBB.begin()) return 1;
- --I;
- if (I->getOpcode() != BccOpc)
- return 1;
-
- // Remove the branch.
- I->eraseFromParent();
- return 2;
-}
-
-unsigned
-ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond) const {
- // FIXME this should probably have a DebugLoc argument
- DebugLoc dl = DebugLoc::getUnknownLoc();
- MachineFunction &MF = *MBB.getParent();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- int BOpc = AFI->isThumbFunction() ?
- (AFI->isThumb2Function() ? ARM::t2B : ARM::tB) : ARM::B;
- int BccOpc = AFI->isThumbFunction() ?
- (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
-
- // Shouldn't be a fall through.
- assert(TBB && "InsertBranch must not be told to insert a fallthrough");
- assert((Cond.size() == 2 || Cond.size() == 0) &&
- "ARM branch conditions have two components!");
- if (FBB == 0) {
- if (Cond.empty()) // Unconditional branch?
- BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB);
- else
- BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
- .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
- return 1;
- }
-
- // Two-way conditional branch.
- BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
- .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
- BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB);
- return 2;
+ return 0;
}
-bool
-ARMBaseInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+bool ARMInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
if (MBB.empty()) return false;
switch (MBB.back().getOpcode()) {
case ARM::BX_RET: // Return.
case ARM::LDM_RET:
- case ARM::tBX_RET:
- case ARM::tBX_RET_vararg:
- case ARM::tPOP_RET:
case ARM::B:
- case ARM::tB:
- case ARM::t2B: // Uncond branch.
- case ARM::tBR_JTr:
- case ARM::t2BR_JTr:
case ARM::BR_JTr: // Jumptable branch.
- case ARM::t2BR_JTm:
case ARM::BR_JTm: // Jumptable branch through mem.
- case ARM::t2BR_JTadd:
case ARM::BR_JTadd: // Jumptable branch add to pc.
return true;
- default: return false;
- }
-}
-
-bool ARMBaseInstrInfo::
-ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
- ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
- Cond[0].setImm(ARMCC::getOppositeCondition(CC));
- return false;
-}
-
-bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
- int PIdx = MI->findFirstPredOperandIdx();
- return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
-}
-
-bool ARMBaseInstrInfo::
-PredicateInstruction(MachineInstr *MI,
- const SmallVectorImpl<MachineOperand> &Pred) const {
- unsigned Opc = MI->getOpcode();
- if (Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B) {
- MI->setDesc(get((Opc == ARM::B) ? ARM::Bcc :
- ((Opc == ARM::tB) ? ARM::tBcc : ARM::t2Bcc)));
- MI->addOperand(MachineOperand::CreateImm(Pred[0].getImm()));
- MI->addOperand(MachineOperand::CreateReg(Pred[1].getReg(), false));
- return true;
- }
-
- int PIdx = MI->findFirstPredOperandIdx();
- if (PIdx != -1) {
- MachineOperand &PMO = MI->getOperand(PIdx);
- PMO.setImm(Pred[0].getImm());
- MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
- return true;
- }
- return false;
-}
-
-bool ARMBaseInstrInfo::
-SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
- const SmallVectorImpl<MachineOperand> &Pred2) const {
- if (Pred1.size() > 2 || Pred2.size() > 2)
- return false;
-
- ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm();
- ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm();
- if (CC1 == CC2)
- return true;
-
- switch (CC1) {
- default:
- return false;
- case ARMCC::AL:
- return true;
- case ARMCC::HS:
- return CC2 == ARMCC::HI;
- case ARMCC::LS:
- return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
- case ARMCC::GE:
- return CC2 == ARMCC::GT;
- case ARMCC::LE:
- return CC2 == ARMCC::LT;
- }
-}
-
-bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
- const TargetInstrDesc &TID = MI->getDesc();
- if (!TID.getImplicitDefs() && !TID.hasOptionalDef())
- return false;
-
- bool Found = false;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (MO.isReg() && MO.getReg() == ARM::CPSR) {
- Pred.push_back(MO);
- Found = true;
- }
- }
-
- return Found;
-}
-
-
-/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing
-static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
- unsigned JTI) DISABLE_INLINE;
-static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
- unsigned JTI) {
- return JT[JTI].MBBs.size();
-}
-
-/// GetInstSize - Return the size of the specified MachineInstr.
-///
-unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- const MachineBasicBlock &MBB = *MI->getParent();
- const MachineFunction *MF = MBB.getParent();
- const TargetAsmInfo *TAI = MF->getTarget().getTargetAsmInfo();
-
- // Basic size info comes from the TSFlags field.
- const TargetInstrDesc &TID = MI->getDesc();
- unsigned TSFlags = TID.TSFlags;
-
- switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) {
- default: {
- // If this machine instr is an inline asm, measure it.
- if (MI->getOpcode() == ARM::INLINEASM)
- return TAI->getInlineAsmLength(MI->getOperand(0).getSymbolName());
- if (MI->isLabel())
- return 0;
- switch (MI->getOpcode()) {
- default:
- assert(0 && "Unknown or unset size field for instr!");
- break;
- case TargetInstrInfo::IMPLICIT_DEF:
- case TargetInstrInfo::DECLARE:
- case TargetInstrInfo::DBG_LABEL:
- case TargetInstrInfo::EH_LABEL:
- return 0;
- }
- break;
- }
- case ARMII::Size8Bytes: return 8; // Arm instruction x 2.
- case ARMII::Size4Bytes: return 4; // Arm instruction.
- case ARMII::Size2Bytes: return 2; // Thumb instruction.
- case ARMII::SizeSpecial: {
- switch (MI->getOpcode()) {
- case ARM::CONSTPOOL_ENTRY:
- // If this machine instr is a constant pool entry, its size is recorded as
- // operand #2.
- return MI->getOperand(2).getImm();
- case ARM::Int_eh_sjlj_setjmp: return 12;
- case ARM::BR_JTr:
- case ARM::BR_JTm:
- case ARM::BR_JTadd:
- case ARM::t2BR_JTr:
- case ARM::t2BR_JTm:
- case ARM::t2BR_JTadd:
- case ARM::tBR_JTr: {
- // These are jumptable branches, i.e. a branch followed by an inlined
- // jumptable. The size is 4 + 4 * number of entries.
- unsigned NumOps = TID.getNumOperands();
- MachineOperand JTOP =
- MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2));
- unsigned JTI = JTOP.getIndex();
- const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
- const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
- assert(JTI < JT.size());
- // Thumb instructions are 2 byte aligned, but JT entries are 4 byte
- // 4 aligned. The assembler / linker may add 2 byte padding just before
- // the JT entries. The size does not include this padding; the
- // constant islands pass does separate bookkeeping for it.
- // FIXME: If we know the size of the function is less than (1 << 16) *2
- // bytes, we can use 16-bit entries instead. Then there won't be an
- // alignment issue.
- return getNumJTEntries(JT, JTI) * 4 +
- ((MI->getOpcode()==ARM::tBR_JTr) ? 2 : 4);
- }
- default:
- // Otherwise, pseudo-instruction sizes are zero.
- return 0;
- }
- }
- }
- return 0; // Not reached
-}
-
-/// Return true if the instruction is a register to register move and
-/// leave the source and dest operands in the passed parameters.
-///
-bool
-ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned& SrcSubIdx, unsigned& DstSubIdx) const {
- SrcSubIdx = DstSubIdx = 0; // No sub-registers.
-
- unsigned oc = MI.getOpcode();
- switch (oc) {
default:
- return false;
- case ARM::FCPYS:
- case ARM::FCPYD:
- case ARM::VMOVD:
- case ARM::VMOVQ:
- SrcReg = MI.getOperand(1).getReg();
- DstReg = MI.getOperand(0).getReg();
- return true;
- case ARM::MOVr:
- assert(MI.getDesc().getNumOperands() >= 2 &&
- MI.getOperand(0).isReg() &&
- MI.getOperand(1).isReg() &&
- "Invalid ARM MOV instruction");
- SrcReg = MI.getOperand(1).getReg();
- DstReg = MI.getOperand(0).getReg();
- return true;
- }
-}
-
-unsigned
-ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- case ARM::LDR:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isReg() &&
- MI->getOperand(3).isImm() &&
- MI->getOperand(2).getReg() == 0 &&
- MI->getOperand(3).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- case ARM::FLDD:
- case ARM::FLDS:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- }
- return 0;
-}
-
-unsigned
-ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- case ARM::STR:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isReg() &&
- MI->getOperand(3).isImm() &&
- MI->getOperand(2).getReg() == 0 &&
- MI->getOperand(3).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- case ARM::FSTD:
- case ARM::FSTS:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
break;
}
- return 0;
-}
-
-bool
-ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SrcReg,
- const TargetRegisterClass *DestRC,
- const TargetRegisterClass *SrcRC) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- if (I != MBB.end()) DL = I->getDebugLoc();
-
- if (DestRC != SrcRC) {
- // Not yet supported!
- return false;
- }
-
- if (DestRC == ARM::GPRRegisterClass)
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
- .addReg(SrcReg)));
- else if (DestRC == ARM::SPRRegisterClass)
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYS), DestReg)
- .addReg(SrcReg));
- else if (DestRC == ARM::DPRRegisterClass)
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
- .addReg(SrcReg));
- else if (DestRC == ARM::QPRRegisterClass)
- BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg);
- else
- return false;
-
- return true;
-}
-
-void ARMBaseInstrInfo::
-storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
- const TargetRegisterClass *RC) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- if (I != MBB.end()) DL = I->getDebugLoc();
-
- if (RC == ARM::GPRRegisterClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addReg(0).addImm(0));
- } else if (RC == ARM::DPRRegisterClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0));
- } else {
- assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTS))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0));
- }
-}
-
-void
-ARMBaseInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const{
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
- if (RC == ARM::GPRRegisterClass) {
- Opc = ARM::STR;
- } else if (RC == ARM::DPRRegisterClass) {
- Opc = ARM::FSTD;
- } else {
- assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
- Opc = ARM::FSTS;
- }
-
- MachineInstrBuilder MIB =
- BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- AddDefaultPred(MIB);
- NewMIs.push_back(MIB);
- return;
-}
-
-void ARMBaseInstrInfo::
-loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
- const TargetRegisterClass *RC) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- if (I != MBB.end()) DL = I->getDebugLoc();
-
- if (RC == ARM::GPRRegisterClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg)
- .addFrameIndex(FI).addReg(0).addImm(0));
- } else if (RC == ARM::DPRRegisterClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg)
- .addFrameIndex(FI).addImm(0));
- } else {
- assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDS), DestReg)
- .addFrameIndex(FI).addImm(0));
- }
-}
-
-void ARMBaseInstrInfo::
-loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
- if (RC == ARM::GPRRegisterClass) {
- Opc = ARM::LDR;
- } else if (RC == ARM::DPRRegisterClass) {
- Opc = ARM::FLDD;
- } else {
- assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
- Opc = ARM::FLDS;
- }
-
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- AddDefaultPred(MIB);
- NewMIs.push_back(MIB);
- return;
+ return false;
}
-MachineInstr *ARMBaseInstrInfo::
-foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops, int FI) const {
- if (Ops.size() != 1) return NULL;
-
- unsigned OpNum = Ops[0];
- unsigned Opc = MI->getOpcode();
- MachineInstr *NewMI = NULL;
- switch (Opc) {
- default: break;
- case ARM::MOVr: {
- if (MI->getOperand(4).getReg() == ARM::CPSR)
- // If it is updating CPSR, then it cannot be folded.
- break;
- unsigned Pred = MI->getOperand(2).getImm();
- unsigned PredReg = MI->getOperand(3).getReg();
- if (OpNum == 0) { // move -> store
- unsigned SrcReg = MI->getOperand(1).getReg();
- bool isKill = MI->getOperand(1).isKill();
- bool isUndef = MI->getOperand(1).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR))
- .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
- .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
- } else { // move -> load
- unsigned DstReg = MI->getOperand(0).getReg();
- bool isDead = MI->getOperand(0).isDead();
- bool isUndef = MI->getOperand(0).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR))
- .addReg(DstReg,
- RegState::Define |
- getDeadRegState(isDead) |
- getUndefRegState(isUndef))
- .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
- }
- break;
- }
- case ARM::FCPYS: {
- unsigned Pred = MI->getOperand(2).getImm();
- unsigned PredReg = MI->getOperand(3).getReg();
- if (OpNum == 0) { // move -> store
- unsigned SrcReg = MI->getOperand(1).getReg();
- bool isKill = MI->getOperand(1).isKill();
- bool isUndef = MI->getOperand(1).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTS))
- .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
- .addFrameIndex(FI)
- .addImm(0).addImm(Pred).addReg(PredReg);
- } else { // move -> load
- unsigned DstReg = MI->getOperand(0).getReg();
- bool isDead = MI->getOperand(0).isDead();
- bool isUndef = MI->getOperand(0).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDS))
- .addReg(DstReg,
- RegState::Define |
- getDeadRegState(isDead) |
- getUndefRegState(isUndef))
- .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
- }
- break;
- }
- case ARM::FCPYD: {
- unsigned Pred = MI->getOperand(2).getImm();
- unsigned PredReg = MI->getOperand(3).getReg();
- if (OpNum == 0) { // move -> store
- unsigned SrcReg = MI->getOperand(1).getReg();
- bool isKill = MI->getOperand(1).isKill();
- bool isUndef = MI->getOperand(1).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTD))
- .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
- .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
- } else { // move -> load
- unsigned DstReg = MI->getOperand(0).getReg();
- bool isDead = MI->getOperand(0).isDead();
- bool isUndef = MI->getOperand(0).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDD))
- .addReg(DstReg,
- RegState::Define |
- getDeadRegState(isDead) |
- getUndefRegState(isUndef))
- .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
- }
- break;
- }
+void ARMInstrInfo::
+reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig) const {
+ DebugLoc dl = Orig->getDebugLoc();
+ if (Orig->getOpcode() == ARM::MOVi2pieces) {
+ RI.emitLoadConstPool(MBB, I, dl,
+ DestReg, SubIdx,
+ Orig->getOperand(1).getImm(),
+ (ARMCC::CondCodes)Orig->getOperand(2).getImm(),
+ Orig->getOperand(3).getReg());
+ return;
}
- return NewMI;
-}
-
-MachineInstr*
-ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- MachineInstr* LoadMI) const {
- return 0;
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MI->getOperand(0).setReg(DestReg);
+ MBB.insert(I, MI);
}
-bool
-ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops) const {
- if (Ops.size() != 1) return false;
-
- unsigned Opc = MI->getOpcode();
- switch (Opc) {
- default: break;
- case ARM::MOVr:
- // If it is updating CPSR, then it cannot be folded.
- return MI->getOperand(4).getReg() != ARM::CPSR;
- case ARM::FCPYS:
- case ARM::FCPYD:
- return true;
-
- case ARM::VMOVD:
- case ARM::VMOVQ:
- return false; // FIXME
- }
-
- return false;
-}
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
index 8c8f788..c616949 100644
--- a/lib/Target/ARM/ARMInstrInfo.h
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -15,247 +15,27 @@
#define ARMINSTRUCTIONINFO_H
#include "llvm/Target/TargetInstrInfo.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
#include "ARM.h"
namespace llvm {
class ARMSubtarget;
-/// ARMII - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace ARMII {
- enum {
- //===------------------------------------------------------------------===//
- // Instruction Flags.
-
- //===------------------------------------------------------------------===//
- // This four-bit field describes the addressing mode used.
-
- AddrModeMask = 0xf,
- AddrModeNone = 0,
- AddrMode1 = 1,
- AddrMode2 = 2,
- AddrMode3 = 3,
- AddrMode4 = 4,
- AddrMode5 = 5,
- AddrMode6 = 6,
- AddrModeT1_1 = 7,
- AddrModeT1_2 = 8,
- AddrModeT1_4 = 9,
- AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data
- AddrModeT2_i12 = 11,
- AddrModeT2_i8 = 12,
- AddrModeT2_so = 13,
- AddrModeT2_pc = 14, // +/- i12 for pc relative data
- AddrModeT2_i8s4 = 15, // i8 * 4
-
- // Size* - Flags to keep track of the size of an instruction.
- SizeShift = 4,
- SizeMask = 7 << SizeShift,
- SizeSpecial = 1, // 0 byte pseudo or special case.
- Size8Bytes = 2,
- Size4Bytes = 3,
- Size2Bytes = 4,
-
- // IndexMode - Unindex, pre-indexed, or post-indexed. Only valid for load
- // and store ops
- IndexModeShift = 7,
- IndexModeMask = 3 << IndexModeShift,
- IndexModePre = 1,
- IndexModePost = 2,
-
- //===------------------------------------------------------------------===//
- // Misc flags.
-
- // UnaryDP - Indicates this is a unary data processing instruction, i.e.
- // it doesn't have a Rn operand.
- UnaryDP = 1 << 9,
-
- //===------------------------------------------------------------------===//
- // Instruction encoding formats.
- //
- FormShift = 10,
- FormMask = 0x1f << FormShift,
-
- // Pseudo instructions
- Pseudo = 0 << FormShift,
-
- // Multiply instructions
- MulFrm = 1 << FormShift,
-
- // Branch instructions
- BrFrm = 2 << FormShift,
- BrMiscFrm = 3 << FormShift,
-
- // Data Processing instructions
- DPFrm = 4 << FormShift,
- DPSoRegFrm = 5 << FormShift,
-
- // Load and Store
- LdFrm = 6 << FormShift,
- StFrm = 7 << FormShift,
- LdMiscFrm = 8 << FormShift,
- StMiscFrm = 9 << FormShift,
- LdStMulFrm = 10 << FormShift,
-
- // Miscellaneous arithmetic instructions
- ArithMiscFrm = 11 << FormShift,
-
- // Extend instructions
- ExtFrm = 12 << FormShift,
-
- // VFP formats
- VFPUnaryFrm = 13 << FormShift,
- VFPBinaryFrm = 14 << FormShift,
- VFPConv1Frm = 15 << FormShift,
- VFPConv2Frm = 16 << FormShift,
- VFPConv3Frm = 17 << FormShift,
- VFPConv4Frm = 18 << FormShift,
- VFPConv5Frm = 19 << FormShift,
- VFPLdStFrm = 20 << FormShift,
- VFPLdStMulFrm = 21 << FormShift,
- VFPMiscFrm = 22 << FormShift,
-
- // Thumb format
- ThumbFrm = 23 << FormShift,
-
- // NEON format
- NEONFrm = 24 << FormShift,
- NEONGetLnFrm = 25 << FormShift,
- NEONSetLnFrm = 26 << FormShift,
- NEONDupFrm = 27 << FormShift,
-
- //===------------------------------------------------------------------===//
- // Field shifts - such shifts are used to set field while generating
- // machine instructions.
- M_BitShift = 5,
- ShiftImmShift = 5,
- ShiftShift = 7,
- N_BitShift = 7,
- ImmHiShift = 8,
- SoRotImmShift = 8,
- RegRsShift = 8,
- ExtRotImmShift = 10,
- RegRdLoShift = 12,
- RegRdShift = 12,
- RegRdHiShift = 16,
- RegRnShift = 16,
- S_BitShift = 20,
- W_BitShift = 21,
- AM3_I_BitShift = 22,
- D_BitShift = 22,
- U_BitShift = 23,
- P_BitShift = 24,
- I_BitShift = 25,
- CondShift = 28
- };
-}
-
-class ARMBaseInstrInfo : public TargetInstrInfoImpl {
-protected:
- // Can be only subclassed.
- explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
-public:
- virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const;
-
- virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
-
- // Branch analysis.
- virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const;
- virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
- virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond) const;
-
- virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
- virtual
- bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
-
- // Predication support.
- virtual bool isPredicated(const MachineInstr *MI) const;
-
- ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
- int PIdx = MI->findFirstPredOperandIdx();
- return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
- : ARMCC::AL;
- }
-
- virtual
- bool PredicateInstruction(MachineInstr *MI,
- const SmallVectorImpl<MachineOperand> &Pred) const;
-
- virtual
- bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
- const SmallVectorImpl<MachineOperand> &Pred2) const;
-
- virtual bool DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const;
-
- /// GetInstSize - Returns the size of the specified MachineInstr.
- ///
- virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
-
- /// Return true if the instruction is a register to register move and return
- /// the source and dest operands and their sub-register indices by reference.
- virtual bool isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
-
- virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
- virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
-
- virtual bool copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SrcReg,
- const TargetRegisterClass *DestRC,
- const TargetRegisterClass *SrcRC) const;
- virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC) const;
-
- virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
- virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC) const;
-
- virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
- virtual bool canFoldMemoryOperand(const MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops) const;
-
- virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex) const;
-
- virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- MachineInstr* LoadMI) const;
-};
-
class ARMInstrInfo : public ARMBaseInstrInfo {
ARMRegisterInfo RI;
+ const ARMSubtarget &Subtarget;
public:
explicit ARMInstrInfo(const ARMSubtarget &STI);
+  // Return the non-pre/post-incrementing version of 'Opc'. Return 0
+  // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const;
+
+ // Return true if the block does not fall through.
+ bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
@@ -263,7 +43,8 @@ public:
const ARMRegisterInfo &getRegisterInfo() const { return RI; }
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- unsigned DestReg, const MachineInstr *Orig) const;
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig) const;
};
}
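
getUnindexedOpcode() maps a pre/post-indexed load or store opcode back to its
plain (non-incrementing) form, returning 0 when no such form exists. A minimal
sketch of how a caller might use such a hook; the opcode numbers and the table
below are illustrative stand-ins, not the real ARM opcode enum:

    #include <cstdint>

    // Illustrative stand-in for getUnindexedOpcode(): map indexed forms to
    // their plain counterparts; 0 means "no unindexed version".
    unsigned getUnindexedOpcode(unsigned Opc) {
      switch (Opc) {
      case 100: return 10;   // e.g. LDR_PRE  -> LDR (values are made up)
      case 101: return 10;   // e.g. LDR_POST -> LDR
      default:  return 0;
      }
    }

    // Typical caller pattern: strip the addressing-mode increment if possible.
    bool tryStripIndexing(unsigned &Opc) {
      unsigned Plain = getUnindexedOpcode(Opc);
      if (Plain == 0)
        return false;   // keep the pre/post-incrementing form
      Opc = Plain;
      return true;
    }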
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 408f47a..8adfac3 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -34,6 +34,10 @@ def SDT_ARMBrJT : SDTypeProfile<0, 3,
[SDTCisPtrTy<0>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>]>;
+def SDT_ARMBr2JT : SDTypeProfile<0, 4,
+ [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
@@ -71,6 +75,8 @@ def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
[SDNPHasChain]>;
+def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
+ [SDNPHasChain]>;
def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp,
[SDNPOutFlag]>;
@@ -93,10 +99,14 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">;
+def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">;
+def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
def HasV7 : Predicate<"Subtarget->hasV7Ops()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3()">;
def HasNEON : Predicate<"Subtarget->hasNEON()">;
+def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
def IsThumb : Predicate<"Subtarget->isThumb()">;
def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
def IsThumb2 : Predicate<"Subtarget->isThumb2()">;
@@ -117,25 +127,16 @@ class RegConstraint<string C> {
// ARM specific transformation functions and pattern fragments.
//
-// so_imm_XFORM - Return a so_imm value packed into the format described for
-// so_imm def below.
-def so_imm_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(N->getZExtValue()),
- MVT::i32);
-}]>;
-
// so_imm_neg_XFORM - Return a so_imm value packed into the format described for
// so_imm_neg def below.
def so_imm_neg_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(-(int)N->getZExtValue()),
- MVT::i32);
+ return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
}]>;
// so_imm_not_XFORM - Return a so_imm value packed into the format described for
// so_imm_not def below.
def so_imm_not_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(~(int)N->getZExtValue()),
- MVT::i32);
+ return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32);
}]>;
// rot_imm predicate - True if the 32-bit immediate is equal to 8, 16, or 24.
@@ -169,6 +170,48 @@ def sext_16_node : PatLeaf<(i32 GPR:$a), [{
return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
}]>;
+/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary-width bitfield
+/// e.g., 0xf000ffff
+def bf_inv_mask_imm : Operand<i32>,
+ PatLeaf<(imm), [{
+ uint32_t v = (uint32_t)N->getZExtValue();
+ if (v == 0xffffffff)
+ return 0;
+  // There can be 1's on either or both "outsides"; all the "inside"
+  // bits must be 0's.
+ unsigned int lsb = 0, msb = 31;
+ while (v & (1 << msb)) --msb;
+ while (v & (1 << lsb)) ++lsb;
+ for (unsigned int i = lsb; i <= msb; ++i) {
+ if (v & (1 << i))
+ return 0;
+ }
+ return 1;
+}]> {
+ let PrintMethod = "printBitfieldInvMaskImmOperand";
+}
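+
+// The predicate above accepts exactly the masks whose clear bits form one
+// contiguous run, i.e. values that a single BFC (bitfield clear) can
+// implement. A standalone C++ sketch of the same test:
+//
+//   #include <cstdint>
+//
+//   // True iff the zero bits of 'v' form a single contiguous run, so
+//   // 'and dst, src, #v' can become 'bfc dst, #lsb, #width'.
+//   bool isBitfieldInvMask(uint32_t v) {
+//     if (v == 0xffffffffu)                 // nothing to clear
+//       return false;
+//     uint32_t zeros = ~v;                  // the run BFC would clear
+//     uint32_t lsb = zeros & -zeros;        // lowest clear bit of v
+//     return ((zeros + lsb) & zeros) == 0;  // contiguous iff the carry
+//   }                                       // ripples past the whole run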
+
+/// Split a 32-bit immediate into two 16-bit parts.
+def lo16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() & 0xffff,
+ MVT::i32);
+}]>;
+
+def hi16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32);
+}]>;
+
+def lo16AllZero : PatLeaf<(i32 imm), [{
+  // Returns true if the low 16 bits are all 0.
+ return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
+ }], hi16>;
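+
+// These XForms feed the movw/movt pair added later in this patch (MOVi16 and
+// MOVTi16): movw writes the low halfword and zero-extends, movt overwrites
+// only the high halfword. A sketch of the split, with hypothetical names:
+//
+//   #include <cstdint>
+//
+//   struct MovwMovt { uint32_t lo, hi; };
+//
+//   // Decompose a 32-bit constant the way the lo16/hi16 XForms do.
+//   MovwMovt splitImm32(uint32_t v) {
+//     return { v & 0xffffu, v >> 16 };
+//   }
+//   // movw zero-extends, so hi == 0 needs no movt at all; lo16AllZero
+//   // matches constants whose low half is zero, letting a lone movt
+//   // merge the high half into a register that already holds the rest.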
+
+/// imm0_65535 predicate - True if the 32-bit immediate is in the range
+/// [0,65535].
+def imm0_65535 : PatLeaf<(i32 imm), [{
+ return (uint32_t)N->getZExtValue() < 65536;
+}]>;
+
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
@@ -192,6 +235,9 @@ def cpinst_operand : Operand<i32> {
def jtblock_operand : Operand<i32> {
let PrintMethod = "printJTBlockOperand";
}
+def jt2block_operand : Operand<i32> {
+ let PrintMethod = "printJT2BlockOperand";
+}
// Local PC labels.
def pclabel : Operand<i32> {
@@ -212,9 +258,9 @@ def so_reg : Operand<i32>, // reg reg imm
// into so_imm instructions: the 8-bit immediate is the least significant bits
// [bits 0-7], the 4-bit shift amount is the next 4 bits [bits 8-11].
def so_imm : Operand<i32>,
- PatLeaf<(imm),
- [{ return ARM_AM::getSOImmVal(N->getZExtValue()) != -1; }],
- so_imm_XFORM> {
+ PatLeaf<(imm), [{
+ return ARM_AM::getSOImmVal(N->getZExtValue()) != -1;
+ }]> {
let PrintMethod = "printSOImmOperand";
}
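
so_imm now carries the plain 32-bit value; the predicate still requires that
it be encodable as an 8-bit constant rotated right by an even amount. A
self-contained version of that check (a sketch, not the ARM_AM helper itself):

    #include <cstdint>

    // True iff v == ror(imm8, 2*k) for some imm8 <= 0xff and 0 <= k < 16.
    bool isSOImm(uint32_t v) {
      if (v <= 0xffu)
        return true;                                      // rotation 0
      for (unsigned rot = 2; rot < 32; rot += 2) {
        uint32_t undone = (v << rot) | (v >> (32 - rot)); // rotate left
        if (undone <= 0xffu)
          return true;
      }
      return false;
    }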
@@ -230,14 +276,18 @@ def so_imm2part : Operand<i32>,
def so_imm2part_1 : SDNodeXForm<imm, [{
unsigned V = ARM_AM::getSOImmTwoPartFirst((unsigned)N->getZExtValue());
- return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32);
+ return CurDAG->getTargetConstant(V, MVT::i32);
}]>;
def so_imm2part_2 : SDNodeXForm<imm, [{
unsigned V = ARM_AM::getSOImmTwoPartSecond((unsigned)N->getZExtValue());
- return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32);
+ return CurDAG->getTargetConstant(V, MVT::i32);
}]>;
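
so_imm2part splits a constant into two so_imm pieces that an ORR/ORR (or
EOR/EOR) pair can combine; after this change each XForm returns the raw piece
rather than a pre-encoded one. A greedy decomposition sketch; this is one
plausible scheme, and the real ARM_AM::getSOImmTwoPart* helpers may differ:

    #include <cstdint>

    // Peel an 8-bit field starting at the lowest set bit, rounded down to
    // an even position (rotations are even). Assumes v != 0; the split is
    // only usable when the remainder is itself a valid so_imm.
    uint32_t firstPart(uint32_t v) {
      unsigned lsb = 0;
      while (!(v & (1u << lsb))) ++lsb;
      lsb &= ~1u;
      return v & (0xffu << lsb);
    }
    uint32_t secondPart(uint32_t v) { return v & ~firstPart(v); }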
+/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def imm0_31 : Operand<i32>, PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < 32;
+}]>;
// Define ARM specific addressing modes.
@@ -274,7 +324,7 @@ def am3offset : Operand<i32>,
// addrmode4 := reg, <mode|W>
//
def addrmode4 : Operand<i32>,
- ComplexPattern<i32, 2, "", []> {
+ ComplexPattern<i32, 2, "SelectAddrMode4", []> {
let PrintMethod = "printAddrMode4Operand";
let MIOperandInfo = (ops GPR, i32imm);
}
@@ -303,17 +353,8 @@ def addrmodepc : Operand<i32>,
let MIOperandInfo = (ops GPR, i32imm);
}
-// ARM Predicate operand. Default to 14 = always (AL). Second part is CC
-// register whose default is 0 (no register).
-def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
- (ops (i32 14), (i32 zero_reg))> {
- let PrintMethod = "printPredicateOperand";
-}
-
-// Conditional code result for instructions whose 's' bit is set, e.g. subs.
-//
-def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
- let PrintMethod = "printSBitModifierOperand";
+def nohash_imm : Operand<i32> {
+ let PrintMethod = "printNoHashImmediate";
}
//===----------------------------------------------------------------------===//
@@ -329,34 +370,44 @@ include "ARMInstrFormats.td"
multiclass AsI1_bin_irs<bits<4> opcod, string opc, PatFrag opnode,
bit Commutable = 0> {
def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
- opc, " $dst, $a, $b",
- [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>;
+ IIC_iALUi, opc, " $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> {
+ let Inst{25} = 1;
+ }
def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
- opc, " $dst, $a, $b",
+ IIC_iALUr, opc, " $dst, $a, $b",
[(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> {
+ let Inst{25} = 0;
let isCommutable = Commutable;
}
def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
- opc, " $dst, $a, $b",
- [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>;
+ IIC_iALUsr, opc, " $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> {
+ let Inst{25} = 0;
+ }
}
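
The new 'let Inst{25} = ...' lines pin down bit 25 of the ARM data-processing
encoding, the "I" bit, which selects an immediate operand (1) versus a
register/shifter operand (0). Reading it back from an encoded word:

    #include <cstdint>

    // Bit 25 ("I" bit) of an ARM data-processing instruction: 1 means the
    // second operand is a rotated 8-bit immediate, 0 means a register.
    bool isImmediateForm(uint32_t insn) {
      return (insn >> 25) & 1u;
    }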
/// AI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the
-/// instruction modifies the CSPR register.
+/// instruction modifies the CPSR register.
let Defs = [CPSR] in {
multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
bit Commutable = 0> {
def ri : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
- opc, "s $dst, $a, $b",
- [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>;
+ IIC_iALUi, opc, "s $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> {
+ let Inst{25} = 1;
+ }
def rr : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
- opc, "s $dst, $a, $b",
+ IIC_iALUr, opc, "s $dst, $a, $b",
[(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> {
let isCommutable = Commutable;
+ let Inst{25} = 0;
}
def rs : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
- opc, "s $dst, $a, $b",
- [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>;
+ IIC_iALUsr, opc, "s $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> {
+ let Inst{25} = 0;
+ }
}
}
@@ -366,17 +417,25 @@ multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
let Defs = [CPSR] in {
multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode,
bit Commutable = 0> {
- def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm,
+ def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iCMPi,
opc, " $a, $b",
- [(opnode GPR:$a, so_imm:$b)]>;
- def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm,
+ [(opnode GPR:$a, so_imm:$b)]> {
+ let Inst{20} = 1;
+ let Inst{25} = 1;
+ }
+ def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm, IIC_iCMPr,
opc, " $a, $b",
[(opnode GPR:$a, GPR:$b)]> {
+ let Inst{20} = 1;
+ let Inst{25} = 0;
let isCommutable = Commutable;
}
- def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+ def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iCMPsr,
opc, " $a, $b",
- [(opnode GPR:$a, so_reg:$b)]>;
+ [(opnode GPR:$a, so_reg:$b)]> {
+ let Inst{20} = 1;
+ let Inst{25} = 0;
+ }
}
}
@@ -384,15 +443,15 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode,
/// register and one whose operand is a register rotated by 8/16/24.
/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> {
- def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$Src),
- opc, " $dst, $Src",
- [(set GPR:$dst, (opnode GPR:$Src))]>,
+ def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src),
+ IIC_iUNAr, opc, " $dst, $src",
+ [(set GPR:$dst, (opnode GPR:$src))]>,
Requires<[IsARM, HasV6]> {
let Inst{19-16} = 0b1111;
}
- def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$Src, i32imm:$rot),
- opc, " $dst, $Src, ror $rot",
- [(set GPR:$dst, (opnode (rotr GPR:$Src, rot_imm:$rot)))]>,
+ def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot),
+ IIC_iUNAsi, opc, " $dst, $src, ror $rot",
+ [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>,
Requires<[IsARM, HasV6]> {
let Inst{19-16} = 0b1111;
}
@@ -402,11 +461,11 @@ multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> {
/// register and one whose operand is a register rotated by 8/16/24.
multiclass AI_bin_rrot<bits<8> opcod, string opc, PatFrag opnode> {
def rr : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS),
- opc, " $dst, $LHS, $RHS",
+ IIC_iALUr, opc, " $dst, $LHS, $RHS",
[(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>,
Requires<[IsARM, HasV6]>;
def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot),
- opc, " $dst, $LHS, $RHS, ror $rot",
+ IIC_iALUsi, opc, " $dst, $LHS, $RHS, ror $rot",
[(set GPR:$dst, (opnode GPR:$LHS,
(rotr GPR:$RHS, rot_imm:$rot)))]>,
Requires<[IsARM, HasV6]>;
@@ -417,37 +476,45 @@ let Uses = [CPSR] in {
multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
bit Commutable = 0> {
def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
- DPFrm, opc, " $dst, $a, $b",
+ DPFrm, IIC_iALUi, opc, " $dst, $a, $b",
[(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>,
- Requires<[IsARM, CarryDefIsUnused]>;
+ Requires<[IsARM, CarryDefIsUnused]> {
+ let Inst{25} = 1;
+ }
def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- DPFrm, opc, " $dst, $a, $b",
+ DPFrm, IIC_iALUr, opc, " $dst, $a, $b",
[(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>,
Requires<[IsARM, CarryDefIsUnused]> {
let isCommutable = Commutable;
+ let Inst{25} = 0;
}
def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
- DPSoRegFrm, opc, " $dst, $a, $b",
+ DPSoRegFrm, IIC_iALUsr, opc, " $dst, $a, $b",
[(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>,
- Requires<[IsARM, CarryDefIsUnused]>;
+ Requires<[IsARM, CarryDefIsUnused]> {
+ let Inst{25} = 0;
+ }
// Carry setting variants
def Sri : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
- DPFrm, !strconcat(opc, "s $dst, $a, $b"),
+ DPFrm, IIC_iALUi, !strconcat(opc, "s $dst, $a, $b"),
[(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>,
Requires<[IsARM, CarryDefIsUsed]> {
- let Defs = [CPSR];
+ let Defs = [CPSR];
+ let Inst{25} = 1;
}
def Srr : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- DPFrm, !strconcat(opc, "s $dst, $a, $b"),
+ DPFrm, IIC_iALUr, !strconcat(opc, "s $dst, $a, $b"),
[(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>,
Requires<[IsARM, CarryDefIsUsed]> {
- let Defs = [CPSR];
+ let Defs = [CPSR];
+ let Inst{25} = 0;
}
def Srs : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
- DPSoRegFrm, !strconcat(opc, "s $dst, $a, $b"),
+ DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "s $dst, $a, $b"),
[(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>,
Requires<[IsARM, CarryDefIsUsed]> {
- let Defs = [CPSR];
+ let Defs = [CPSR];
+ let Inst{25} = 0;
}
}
}
@@ -467,23 +534,23 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
let neverHasSideEffects = 1, isNotDuplicable = 1 in
def CONSTPOOL_ENTRY :
PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
- i32imm:$size),
+ i32imm:$size), NoItinerary,
"${instid:label} ${cpidx:cpentry}", []>;
let Defs = [SP], Uses = [SP] in {
def ADJCALLSTACKUP :
-PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p),
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
"@ ADJCALLSTACKUP $amt1",
[(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKDOWN :
-PseudoInst<(outs), (ins i32imm:$amt, pred:$p),
+PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
"@ ADJCALLSTACKDOWN $amt",
[(ARMcallseq_start timm:$amt)]>;
}
def DWARF_LOC :
-PseudoInst<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file),
+PseudoInst<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file), NoItinerary,
".loc $file, $line, $col",
[(dwarf_loc (i32 imm:$line), (i32 imm:$col), (i32 imm:$file))]>;
@@ -491,42 +558,42 @@ PseudoInst<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file),
// Address computation and loads and stores in PIC mode.
let isNotDuplicable = 1 in {
def PICADD : AXI1<0b0100, (outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
- Pseudo, "$cp:\n\tadd$p $dst, pc, $a",
+ Pseudo, IIC_iALUr, "\n$cp:\n\tadd$p $dst, pc, $a",
[(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>;
let AddedComplexity = 10 in {
let canFoldAsLoad = 1 in
def PICLDR : AXI2ldw<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tldr$p $dst, $addr",
+ Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr$p $dst, $addr",
[(set GPR:$dst, (load addrmodepc:$addr))]>;
def PICLDRH : AXI3ldh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tldr${p}h $dst, $addr",
+ Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}h $dst, $addr",
[(set GPR:$dst, (zextloadi16 addrmodepc:$addr))]>;
def PICLDRB : AXI2ldb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tldr${p}b $dst, $addr",
+ Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}b $dst, $addr",
[(set GPR:$dst, (zextloadi8 addrmodepc:$addr))]>;
def PICLDRSH : AXI3ldsh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tldr${p}sh $dst, $addr",
+ Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}sh $dst, $addr",
[(set GPR:$dst, (sextloadi16 addrmodepc:$addr))]>;
def PICLDRSB : AXI3ldsb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tldr${p}sb $dst, $addr",
+ Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}sb $dst, $addr",
[(set GPR:$dst, (sextloadi8 addrmodepc:$addr))]>;
}
let AddedComplexity = 10 in {
def PICSTR : AXI2stw<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tstr$p $src, $addr",
+ Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr$p $src, $addr",
[(store GPR:$src, addrmodepc:$addr)]>;
def PICSTRH : AXI3sth<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tstr${p}h $src, $addr",
+ Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr${p}h $src, $addr",
[(truncstorei16 GPR:$src, addrmodepc:$addr)]>;
def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tstr${p}b $src, $addr",
+ Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr${p}b $src, $addr",
[(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
}
} // isNotDuplicable = 1
@@ -534,135 +601,152 @@ def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
// LEApcrel - Load a pc-relative address into a register without offending the
// assembler.
-def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo,
- !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
- "${:private}PCRELL${:uid}+8))\n"),
- !strconcat("${:private}PCRELL${:uid}:\n\t",
- "add$p $dst, pc, #PCRELV${:uid}")),
+def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p),
+ Pseudo, IIC_iALUi,
+ !strconcat(!strconcat(".set ${:private}PCRELV${:uid}, ($label-(",
+ "${:private}PCRELL${:uid}+8))\n"),
+ !strconcat("${:private}PCRELL${:uid}:\n\t",
+ "add$p $dst, pc, #${:private}PCRELV${:uid}")),
[]>;
def LEApcrelJT : AXI1<0x0, (outs GPR:$dst),
- (ins i32imm:$label, i32imm:$id, pred:$p),
- Pseudo,
- !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
- "${:private}PCRELL${:uid}+8))\n"),
- !strconcat("${:private}PCRELL${:uid}:\n\t",
- "add$p $dst, pc, #PCRELV${:uid}")),
- []>;
+ (ins i32imm:$label, nohash_imm:$id, pred:$p),
+ Pseudo, IIC_iALUi,
+ !strconcat(!strconcat(".set ${:private}PCRELV${:uid}, "
+ "(${label}_${id}-(",
+ "${:private}PCRELL${:uid}+8))\n"),
+ !strconcat("${:private}PCRELL${:uid}:\n\t",
+ "add$p $dst, pc, #${:private}PCRELV${:uid}")),
+ []> {
+ let Inst{25} = 1;
+}
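+
+// The '.set ... ($label-(PCRELL+8))' arithmetic accounts for the ARM
+// pipeline: reading pc inside an instruction yields that instruction's
+// address plus 8. A worked check of the constant, with illustrative names:
+//
+//   #include <cstdint>
+//
+//   // Offset LEApcrel feeds into 'add $dst, pc, #off'. When the add runs,
+//   // pc reads as (address of the add) + 8 on ARM.
+//   uint32_t leaOffset(uint32_t labelAddr, uint32_t addInsnAddr) {
+//     return labelAddr - (addInsnAddr + 8);  // matches $label-(PCRELL+8)
+//   }
+//   // dst = (addInsnAddr + 8) + leaOffset(labelAddr, addInsnAddr)
+//   //     = labelAddr, as intended.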
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
-let isReturn = 1, isTerminator = 1 in
- def BX_RET : AI<(outs), (ins), BrMiscFrm, "bx", " lr", [(ARMretflag)]> {
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in
+ def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br,
+ "bx", " lr", [(ARMretflag)]> {
let Inst{7-4} = 0b0001;
let Inst{19-8} = 0b111111111111;
let Inst{27-20} = 0b00010010;
}
// FIXME: remove when we have a way to mark an MI with these properties.
-// FIXME: $dst1 should be a def. But the extra ops must be in the end of the
-// operand list.
// FIXME: Should pc be an implicit operand like PICADD, etc?
-let isReturn = 1, isTerminator = 1 in
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+ hasExtraDefRegAllocReq = 1 in
def LDM_RET : AXI4ld<(outs),
- (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops),
- LdStMulFrm, "ldm${p}${addr:submode} $addr, $dst1",
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ LdStMulFrm, IIC_Br, "ldm${p}${addr:submode} $addr, $wb",
[]>;
// On non-Darwin platforms R9 is callee-saved.
-let isCall = 1, Itinerary = IIC_Br,
- Defs = [R0, R1, R2, R3, R12, LR,
- D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in {
+let isCall = 1,
+ Defs = [R0, R1, R2, R3, R12, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
def BL : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
- "bl ${func:call}",
- [(ARMcall tglobaladdr:$func)]>, Requires<[IsNotDarwin]>;
+ IIC_Br, "bl ${func:call}",
+ [(ARMcall tglobaladdr:$func)]>,
+ Requires<[IsARM, IsNotDarwin]>;
def BL_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
- "bl", " ${func:call}",
- [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsNotDarwin]>;
+ IIC_Br, "bl", " ${func:call}",
+ [(ARMcall_pred tglobaladdr:$func)]>,
+ Requires<[IsARM, IsNotDarwin]>;
// ARMv5T and above
def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
- "blx $func",
- [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsNotDarwin]> {
+ IIC_Br, "blx $func",
+ [(ARMcall GPR:$func)]>,
+ Requires<[IsARM, HasV5T, IsNotDarwin]> {
let Inst{7-4} = 0b0011;
let Inst{19-8} = 0b111111111111;
let Inst{27-20} = 0b00010010;
}
- let Uses = [LR] in {
- // ARMv4T
- def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops),
- "mov lr, pc\n\tbx $func",
- [(ARMcall_nolink GPR:$func)]>, Requires<[IsNotDarwin]>;
+ // ARMv4T
+ def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops),
+ IIC_Br, "mov lr, pc\n\tbx $func",
+ [(ARMcall_nolink GPR:$func)]>,
+ Requires<[IsARM, IsNotDarwin]> {
+ let Inst{7-4} = 0b0001;
+ let Inst{19-8} = 0b111111111111;
+ let Inst{27-20} = 0b00010010;
}
}
// On Darwin R9 is call-clobbered.
-let isCall = 1, Itinerary = IIC_Br,
- Defs = [R0, R1, R2, R3, R9, R12, LR,
- D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in {
+let isCall = 1,
+ Defs = [R0, R1, R2, R3, R9, R12, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
def BLr9 : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
- "bl ${func:call}",
- [(ARMcall tglobaladdr:$func)]>, Requires<[IsDarwin]>;
+ IIC_Br, "bl ${func:call}",
+ [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]>;
def BLr9_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
- "bl", " ${func:call}",
- [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsDarwin]>;
+ IIC_Br, "bl", " ${func:call}",
+ [(ARMcall_pred tglobaladdr:$func)]>,
+ Requires<[IsARM, IsDarwin]>;
// ARMv5T and above
def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
- "blx $func",
+ IIC_Br, "blx $func",
[(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> {
let Inst{7-4} = 0b0011;
let Inst{19-8} = 0b111111111111;
let Inst{27-20} = 0b00010010;
}
- let Uses = [LR] in {
- // ARMv4T
- def BXr9 : ABXIx2<(outs), (ins GPR:$func, variable_ops),
- "mov lr, pc\n\tbx $func",
- [(ARMcall_nolink GPR:$func)]>, Requires<[IsDarwin]>;
+ // ARMv4T
+ def BXr9 : ABXIx2<(outs), (ins GPR:$func, variable_ops),
+ IIC_Br, "mov lr, pc\n\tbx $func",
+ [(ARMcall_nolink GPR:$func)]>, Requires<[IsARM, IsDarwin]> {
+ let Inst{7-4} = 0b0001;
+ let Inst{19-8} = 0b111111111111;
+ let Inst{27-20} = 0b00010010;
}
}
-let isBranch = 1, isTerminator = 1, Itinerary = IIC_Br in {
+let isBranch = 1, isTerminator = 1 in {
// B is "predicable" since it can be xformed into a Bcc.
let isBarrier = 1 in {
let isPredicable = 1 in
- def B : ABXI<0b1010, (outs), (ins brtarget:$target), "b $target",
- [(br bb:$target)]>;
+ def B : ABXI<0b1010, (outs), (ins brtarget:$target), IIC_Br,
+ "b $target", [(br bb:$target)]>;
let isNotDuplicable = 1, isIndirectBranch = 1 in {
def BR_JTr : JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id),
- "mov pc, $target \n$jt",
+ IIC_Br, "mov pc, $target \n$jt",
[(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]> {
let Inst{20} = 0; // S Bit
let Inst{24-21} = 0b1101;
- let Inst{27-26} = {0,0};
+ let Inst{27-25} = 0b000;
}
def BR_JTm : JTI<(outs),
(ins addrmode2:$target, jtblock_operand:$jt, i32imm:$id),
- "ldr pc, $target \n$jt",
- [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
- imm:$id)]> {
+ IIC_Br, "ldr pc, $target \n$jt",
+ [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
+ imm:$id)]> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 1; // P bit
- let Inst{27-26} = {0,1};
+ let Inst{27-25} = 0b011;
}
def BR_JTadd : JTI<(outs),
(ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id),
- "add pc, $target, $idx \n$jt",
+ IIC_Br, "add pc, $target, $idx \n$jt",
[(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt,
imm:$id)]> {
let Inst{20} = 0; // S bit
let Inst{24-21} = 0b0100;
- let Inst{27-26} = {0,0};
+ let Inst{27-25} = 0b000;
}
} // isNotDuplicable = 1, isIndirectBranch = 1
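
These three forms all implement the same absolute jump-table dispatch: the
selected entry becomes the new pc, with the table itself emitted inline right
after the branch (the $jt operand). The semantics, roughly:

    #include <cstdint>

    // Rough model: 'table' stands in for the inline jump table at $jt.
    // BR_JTr receives the entry already loaded in $target; BR_JTm folds
    // the load ('ldr pc, ...'); BR_JTadd folds an add of base + index.
    uint32_t jumpTarget(const uint32_t *table, unsigned idx) {
      return table[idx];  // becomes the new pc
    }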
} // isBarrier = 1
@@ -670,7 +754,7 @@ let isBranch = 1, isTerminator = 1, Itinerary = IIC_Br in {
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
def Bcc : ABI<0b1010, (outs), (ins brtarget:$target),
- "b", " $target",
+ IIC_Br, "b", " $target",
[/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>;
}
@@ -679,133 +763,141 @@ let isBranch = 1, isTerminator = 1, Itinerary = IIC_Br in {
//
// Load
-let canFoldAsLoad = 1 in
-def LDR : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def LDR : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr,
"ldr", " $dst, $addr",
[(set GPR:$dst, (load addrmode2:$addr))]>;
// Special LDR for loads from non-pc-relative constpools.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1 in
-def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
+def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr,
"ldr", " $dst, $addr", []>;
// Loads with zero extension
def LDRH : AI3ldh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
- "ldr", "h $dst, $addr",
- [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>;
+ IIC_iLoadr, "ldr", "h $dst, $addr",
+ [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>;
-def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
- "ldr", "b $dst, $addr",
- [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>;
+def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
+ IIC_iLoadr, "ldr", "b $dst, $addr",
+ [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>;
// Loads with sign extension
def LDRSH : AI3ldsh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
- "ldr", "sh $dst, $addr",
- [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>;
+ IIC_iLoadr, "ldr", "sh $dst, $addr",
+ [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>;
def LDRSB : AI3ldsb<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
- "ldr", "sb $dst, $addr",
- [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>;
+ IIC_iLoadr, "ldr", "sb $dst, $addr",
+ [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>;
-let mayLoad = 1 in {
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
def LDRD : AI3ldd<(outs GPR:$dst1, GPR:$dst2), (ins addrmode3:$addr), LdMiscFrm,
- "ldr", "d $dst1, $addr", []>, Requires<[IsARM, HasV5T]>;
+ IIC_iLoadr, "ldr", "d $dst1, $addr",
+ []>, Requires<[IsARM, HasV5TE]>;
// Indexed loads
def LDR_PRE : AI2ldwpr<(outs GPR:$dst, GPR:$base_wb),
- (ins addrmode2:$addr), LdFrm,
+ (ins addrmode2:$addr), LdFrm, IIC_iLoadru,
"ldr", " $dst, $addr!", "$addr.base = $base_wb", []>;
def LDR_POST : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb),
- (ins GPR:$base, am2offset:$offset), LdFrm,
+ (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru,
"ldr", " $dst, [$base], $offset", "$base = $base_wb", []>;
def LDRH_PRE : AI3ldhpr<(outs GPR:$dst, GPR:$base_wb),
- (ins addrmode3:$addr), LdMiscFrm,
+ (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
"ldr", "h $dst, $addr!", "$addr.base = $base_wb", []>;
def LDRH_POST : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb),
- (ins GPR:$base,am3offset:$offset), LdMiscFrm,
+ (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
"ldr", "h $dst, [$base], $offset", "$base = $base_wb", []>;
def LDRB_PRE : AI2ldbpr<(outs GPR:$dst, GPR:$base_wb),
- (ins addrmode2:$addr), LdFrm,
+ (ins addrmode2:$addr), LdFrm, IIC_iLoadru,
"ldr", "b $dst, $addr!", "$addr.base = $base_wb", []>;
def LDRB_POST : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb),
- (ins GPR:$base,am2offset:$offset), LdFrm,
+ (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru,
"ldr", "b $dst, [$base], $offset", "$base = $base_wb", []>;
def LDRSH_PRE : AI3ldshpr<(outs GPR:$dst, GPR:$base_wb),
- (ins addrmode3:$addr), LdMiscFrm,
+ (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
"ldr", "sh $dst, $addr!", "$addr.base = $base_wb", []>;
def LDRSH_POST: AI3ldshpo<(outs GPR:$dst, GPR:$base_wb),
- (ins GPR:$base,am3offset:$offset), LdMiscFrm,
+ (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
"ldr", "sh $dst, [$base], $offset", "$base = $base_wb", []>;
def LDRSB_PRE : AI3ldsbpr<(outs GPR:$dst, GPR:$base_wb),
- (ins addrmode3:$addr), LdMiscFrm,
+ (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
"ldr", "sb $dst, $addr!", "$addr.base = $base_wb", []>;
def LDRSB_POST: AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb),
- (ins GPR:$base,am3offset:$offset), LdMiscFrm,
+ (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
"ldr", "sb $dst, [$base], $offset", "$base = $base_wb", []>;
}
// Store
-def STR : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm,
+def STR : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer,
"str", " $src, $addr",
[(store GPR:$src, addrmode2:$addr)]>;
// Stores with truncate
-def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm,
+def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm, IIC_iStorer,
"str", "h $src, $addr",
[(truncstorei16 GPR:$src, addrmode3:$addr)]>;
-def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm,
+def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer,
"str", "b $src, $addr",
[(truncstorei8 GPR:$src, addrmode2:$addr)]>;
// Store doubleword
-let mayStore = 1 in
-def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr),StMiscFrm,
- "str", "d $src1, $addr", []>, Requires<[IsARM, HasV5T]>;
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr),
+ StMiscFrm, IIC_iStorer,
+ "str", "d $src1, $addr", []>, Requires<[IsARM, HasV5TE]>;
// Indexed stores
def STR_PRE : AI2stwpr<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base, am2offset:$offset), StFrm,
+ (ins GPR:$src, GPR:$base, am2offset:$offset),
+ StFrm, IIC_iStoreru,
"str", " $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb,
(pre_store GPR:$src, GPR:$base, am2offset:$offset))]>;
def STR_POST : AI2stwpo<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm,
+ (ins GPR:$src, GPR:$base,am2offset:$offset),
+ StFrm, IIC_iStoreru,
"str", " $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb,
(post_store GPR:$src, GPR:$base, am2offset:$offset))]>;
def STRH_PRE : AI3sthpr<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base,am3offset:$offset), StMiscFrm,
+ (ins GPR:$src, GPR:$base,am3offset:$offset),
+ StMiscFrm, IIC_iStoreru,
"str", "h $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb,
(pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>;
def STRH_POST: AI3sthpo<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base,am3offset:$offset), StMiscFrm,
+ (ins GPR:$src, GPR:$base,am3offset:$offset),
+ StMiscFrm, IIC_iStoreru,
"str", "h $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb, (post_truncsti16 GPR:$src,
GPR:$base, am3offset:$offset))]>;
def STRB_PRE : AI2stbpr<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm,
+ (ins GPR:$src, GPR:$base,am2offset:$offset),
+ StFrm, IIC_iStoreru,
"str", "b $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb, (pre_truncsti8 GPR:$src,
GPR:$base, am2offset:$offset))]>;
def STRB_POST: AI2stbpo<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm,
+ (ins GPR:$src, GPR:$base,am2offset:$offset),
+ StFrm, IIC_iStoreru,
"str", "b $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb, (post_truncsti8 GPR:$src,
GPR:$base, am2offset:$offset))]>;
@@ -814,17 +906,16 @@ def STRB_POST: AI2stbpo<(outs GPR:$base_wb),
// Load / store multiple Instructions.
//
-// FIXME: $dst1 should be a def.
-let mayLoad = 1 in
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
def LDM : AXI4ld<(outs),
- (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops),
- LdStMulFrm, "ldm${p}${addr:submode} $addr, $dst1",
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ LdStMulFrm, IIC_iLoadm, "ldm${p}${addr:submode} $addr, $wb",
[]>;
-let mayStore = 1 in
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
def STM : AXI4st<(outs),
- (ins addrmode4:$addr, pred:$p, reglist:$src1, variable_ops),
- LdStMulFrm, "stm${p}${addr:submode} $addr, $src1",
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ LdStMulFrm, IIC_iStorem, "stm${p}${addr:submode} $addr, $wb",
[]>;
//===----------------------------------------------------------------------===//
@@ -832,16 +923,42 @@ def STM : AXI4st<(outs),
//
let neverHasSideEffects = 1 in
-def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm,
- "mov", " $dst, $src", []>, UnaryDP;
-def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm,
- "mov", " $dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP;
+def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr,
+ "mov", " $dst, $src", []>, UnaryDP;
+def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src),
+ DPSoRegFrm, IIC_iMOVsr,
+ "mov", " $dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP;
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm,
- "mov", " $dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP;
+def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm, IIC_iMOVi,
+ "mov", " $dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP {
+ let Inst{25} = 1;
+}
-def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src),
+ DPFrm, IIC_iMOVi,
+ "movw", " $dst, $src",
+ [(set GPR:$dst, imm0_65535:$src)]>,
+ Requires<[IsARM, HasV6T2]> {
+ let Inst{20} = 0;
+ let Inst{25} = 1;
+}
+
+let Constraints = "$src = $dst" in
+def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
+ DPFrm, IIC_iMOVi,
+ "movt", " $dst, $imm",
+ [(set GPR:$dst,
+ (or (and GPR:$src, 0xffff),
+ lo16AllZero:$imm))]>, UnaryDP,
+ Requires<[IsARM, HasV6T2]> {
+ let Inst{20} = 0;
+ let Inst{25} = 1;
+}
+
+let Uses = [CPSR] in
+def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi,
"mov", " $dst, $src, rrx",
[(set GPR:$dst, (ARMrrx GPR:$src))]>, UnaryDP;
@@ -849,11 +966,11 @@ def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
// due to flag operands.
let Defs = [CPSR] in {
-def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
- "mov", "s $dst, $src, lsr #1",
+def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
+ IIC_iMOVsi, "mov", "s $dst, $src, lsr #1",
[(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP;
def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
- "mov", "s $dst, $src, asr #1",
+ IIC_iMOVsi, "mov", "s $dst, $src, asr #1",
[(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP;
}
@@ -901,6 +1018,24 @@ defm UXTAH : AI_bin_rrot<0b01101111, "uxtah",
// TODO: UXT(A){B|H}16
+def SBFX : I<(outs GPR:$dst),
+ (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+ AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi,
+ "sbfx", " $dst, $src, $lsb, $width", "", []>,
+ Requires<[IsARM, HasV6T2]> {
+ let Inst{27-21} = 0b0111101;
+ let Inst{6-4} = 0b101;
+}
+
+def UBFX : I<(outs GPR:$dst),
+ (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+ AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi,
+ "ubfx", " $dst, $src, $lsb, $width", "", []>,
+ Requires<[IsARM, HasV6T2]> {
+ let Inst{27-21} = 0b0111111;
+ let Inst{6-4} = 0b101;
+}
+
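+// SBFX and UBFX extract a 'width'-bit field starting at bit 'lsb', then
+// sign- or zero-extend it to 32 bits. Reference semantics as a sketch
+// (assumes 0 < width and lsb + width <= 32):
+//
+//   #include <cstdint>
+//
+//   uint32_t ubfx(uint32_t src, unsigned lsb, unsigned width) {
+//     uint32_t mask = (width == 32) ? ~0u : ((1u << width) - 1);
+//     return (src >> lsb) & mask;              // zero-extended field
+//   }
+//
+//   int32_t sbfx(uint32_t src, unsigned lsb, unsigned width) {
+//     uint32_t field = ubfx(src, lsb, width);
+//     uint32_t sign = 1u << (width - 1);
+//     return (int32_t)((field ^ sign) - sign); // sign-extend 'width' bits
+//   }
+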
//===----------------------------------------------------------------------===//
// Arithmetic Instructions.
//
@@ -923,30 +1058,36 @@ defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
// These don't define reg/reg forms, because they are handled above.
def RSBri : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
- "rsb", " $dst, $a, $b",
- [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]>;
+ IIC_iALUi, "rsb", " $dst, $a, $b",
+ [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]> {
+ let Inst{25} = 1;
+}
def RSBrs : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
- "rsb", " $dst, $a, $b",
+ IIC_iALUsr, "rsb", " $dst, $a, $b",
[(set GPR:$dst, (sub so_reg:$b, GPR:$a))]>;
// RSB with 's' bit set.
let Defs = [CPSR] in {
def RSBSri : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
- "rsb", "s $dst, $a, $b",
- [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]>;
+ IIC_iALUi, "rsb", "s $dst, $a, $b",
+ [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]> {
+ let Inst{25} = 1;
+}
def RSBSrs : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
- "rsb", "s $dst, $a, $b",
+ IIC_iALUsr, "rsb", "s $dst, $a, $b",
[(set GPR:$dst, (subc so_reg:$b, GPR:$a))]>;
}
let Uses = [CPSR] in {
def RSCri : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
- DPFrm, "rsc", " $dst, $a, $b",
+ DPFrm, IIC_iALUi, "rsc", " $dst, $a, $b",
[(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>,
- Requires<[IsARM, CarryDefIsUnused]>;
+ Requires<[IsARM, CarryDefIsUnused]> {
+ let Inst{25} = 1;
+}
def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
- DPSoRegFrm, "rsc", " $dst, $a, $b",
+ DPSoRegFrm, IIC_iALUsr, "rsc", " $dst, $a, $b",
[(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>,
Requires<[IsARM, CarryDefIsUnused]>;
}
@@ -954,11 +1095,13 @@ def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
// FIXME: Allow these to be predicated.
let Defs = [CPSR], Uses = [CPSR] in {
def RSCSri : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
- DPFrm, "rscs $dst, $a, $b",
+ DPFrm, IIC_iALUi, "rscs $dst, $a, $b",
[(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>,
- Requires<[IsARM, CarryDefIsUnused]>;
+ Requires<[IsARM, CarryDefIsUnused]> {
+ let Inst{25} = 1;
+}
def RSCSrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
- DPSoRegFrm, "rscs $dst, $a, $b",
+ DPSoRegFrm, IIC_iALUsr, "rscs $dst, $a, $b",
[(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>,
Requires<[IsARM, CarryDefIsUnused]>;
}
@@ -992,16 +1135,27 @@ defm EOR : AsI1_bin_irs<0b0001, "eor",
defm BIC : AsI1_bin_irs<0b1110, "bic",
BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
-def MVNr : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm,
+def BFC : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
+ AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi,
+ "bfc", " $dst, $imm", "$src = $dst",
+ [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
+ Requires<[IsARM, HasV6T2]> {
+ let Inst{27-21} = 0b0111110;
+ let Inst{6-0} = 0b0011111;
+}
+
+def MVNr : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr,
"mvn", " $dst, $src",
[(set GPR:$dst, (not GPR:$src))]>, UnaryDP;
def MVNs : AsI1<0b1111, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm,
- "mvn", " $dst, $src",
+ IIC_iMOVsr, "mvn", " $dst, $src",
[(set GPR:$dst, (not so_reg:$src))]>, UnaryDP;
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm,
- "mvn", " $dst, $imm",
- [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP;
+def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm,
+ IIC_iMOVi, "mvn", " $dst, $imm",
+ [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP {
+ let Inst{25} = 1;
+}
def : ARMPat<(and GPR:$src, so_imm_not:$imm),
(BICri GPR:$src, so_imm_not:$imm)>;
@@ -1012,43 +1166,48 @@ def : ARMPat<(and GPR:$src, so_imm_not:$imm),
let isCommutable = 1 in
def MUL : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- "mul", " $dst, $a, $b",
+ IIC_iMUL32, "mul", " $dst, $a, $b",
[(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;
def MLA : AsMul1I<0b0000001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
- "mla", " $dst, $a, $b, $c",
+ IIC_iMAC32, "mla", " $dst, $a, $b, $c",
[(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>;
+def MLS : AMul1I<0b0000011, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+ IIC_iMAC32, "mls", " $dst, $a, $b, $c",
+ [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>,
+ Requires<[IsARM, HasV6T2]>;
+
// Extra precision multiplies with low / high results
let neverHasSideEffects = 1 in {
let isCommutable = 1 in {
def SMULL : AsMul1I<0b0000110, (outs GPR:$ldst, GPR:$hdst),
- (ins GPR:$a, GPR:$b),
+ (ins GPR:$a, GPR:$b), IIC_iMUL64,
"smull", " $ldst, $hdst, $a, $b", []>;
def UMULL : AsMul1I<0b0000100, (outs GPR:$ldst, GPR:$hdst),
- (ins GPR:$a, GPR:$b),
+ (ins GPR:$a, GPR:$b), IIC_iMUL64,
"umull", " $ldst, $hdst, $a, $b", []>;
}
// Multiply + accumulate
def SMLAL : AsMul1I<0b0000111, (outs GPR:$ldst, GPR:$hdst),
- (ins GPR:$a, GPR:$b),
+ (ins GPR:$a, GPR:$b), IIC_iMAC64,
"smlal", " $ldst, $hdst, $a, $b", []>;
def UMLAL : AsMul1I<0b0000101, (outs GPR:$ldst, GPR:$hdst),
- (ins GPR:$a, GPR:$b),
+ (ins GPR:$a, GPR:$b), IIC_iMAC64,
"umlal", " $ldst, $hdst, $a, $b", []>;
def UMAAL : AMul1I <0b0000010, (outs GPR:$ldst, GPR:$hdst),
- (ins GPR:$a, GPR:$b),
+ (ins GPR:$a, GPR:$b), IIC_iMAC64,
"umaal", " $ldst, $hdst, $a, $b", []>,
Requires<[IsARM, HasV6]>;
} // neverHasSideEffects
// Most significant word multiply
def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- "smmul", " $dst, $a, $b",
+ IIC_iMUL32, "smmul", " $dst, $a, $b",
[(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>,
Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b0001;
@@ -1056,7 +1215,7 @@ def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
}
def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
- "smmla", " $dst, $a, $b, $c",
+ IIC_iMAC32, "smmla", " $dst, $a, $b, $c",
[(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>,
Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b0001;
@@ -1064,7 +1223,7 @@ def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
- "smmls", " $dst, $a, $b, $c",
+ IIC_iMAC32, "smmls", " $dst, $a, $b, $c",
[(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>,
Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b1101;
@@ -1072,7 +1231,7 @@ def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
multiclass AI_smul<string opc, PatFrag opnode> {
def BB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "bb"), " $dst, $a, $b",
+ IIC_iMUL32, !strconcat(opc, "bb"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
(sext_inreg GPR:$b, i16)))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1081,7 +1240,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "bt"), " $dst, $a, $b",
+ IIC_iMUL32, !strconcat(opc, "bt"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
(sra GPR:$b, (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1090,7 +1249,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "tb"), " $dst, $a, $b",
+ IIC_iMUL32, !strconcat(opc, "tb"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
(sext_inreg GPR:$b, i16)))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1099,7 +1258,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "tt"), " $dst, $a, $b",
+ IIC_iMUL32, !strconcat(opc, "tt"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
(sra GPR:$b, (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1108,7 +1267,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "wb"), " $dst, $a, $b",
+ IIC_iMUL16, !strconcat(opc, "wb"), " $dst, $a, $b",
[(set GPR:$dst, (sra (opnode GPR:$a,
(sext_inreg GPR:$b, i16)), (i32 16)))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1117,7 +1276,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "wt"), " $dst, $a, $b",
+ IIC_iMUL16, !strconcat(opc, "wt"), " $dst, $a, $b",
[(set GPR:$dst, (sra (opnode GPR:$a,
(sra GPR:$b, (i32 16))), (i32 16)))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1129,7 +1288,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
multiclass AI_smla<string opc, PatFrag opnode> {
def BB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc,
(opnode (sext_inreg GPR:$a, i16),
(sext_inreg GPR:$b, i16))))]>,
@@ -1139,7 +1298,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
(sra GPR:$b, (i32 16)))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1148,7 +1307,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
(sext_inreg GPR:$b, i16))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1157,7 +1316,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
(sra GPR:$b, (i32 16)))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1166,7 +1325,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
(sext_inreg GPR:$b, i16)), (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1175,7 +1334,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
(sra GPR:$b, (i32 16))), (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1194,7 +1353,7 @@ defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
// Misc. Arithmetic Instructions.
//
-def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src),
+def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
"clz", " $dst, $src",
[(set GPR:$dst, (ctlz GPR:$src))]>, Requires<[IsARM, HasV5T]> {
let Inst{7-4} = 0b0001;
@@ -1202,7 +1361,7 @@ def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src),
let Inst{19-16} = 0b1111;
}
-def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
+def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
"rev", " $dst, $src",
[(set GPR:$dst, (bswap GPR:$src))]>, Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b0011;
@@ -1210,7 +1369,7 @@ def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
let Inst{19-16} = 0b1111;
}
-def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
+def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
"rev16", " $dst, $src",
[(set GPR:$dst,
(or (and (srl GPR:$src, (i32 8)), 0xFF),
@@ -1223,7 +1382,7 @@ def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
let Inst{19-16} = 0b1111;
}
-def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src),
+def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
"revsh", " $dst, $src",
[(set GPR:$dst,
(sext_inreg
@@ -1237,7 +1396,7 @@ def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src),
def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst),
(ins GPR:$src1, GPR:$src2, i32imm:$shamt),
- "pkhbt", " $dst, $src1, $src2, LSL $shamt",
+ IIC_iALUsi, "pkhbt", " $dst, $src1, $src2, LSL $shamt",
[(set GPR:$dst, (or (and GPR:$src1, 0xFFFF),
(and (shl GPR:$src2, (i32 imm:$shamt)),
0xFFFF0000)))]>,
@@ -1254,7 +1413,7 @@ def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)),
def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst),
(ins GPR:$src1, GPR:$src2, i32imm:$shamt),
- "pkhtb", " $dst, $src1, $src2, ASR $shamt",
+ IIC_iALUsi, "pkhtb", " $dst, $src1, $src2, ASR $shamt",
[(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000),
(and (sra GPR:$src2, imm16_31:$shamt),
0xFFFF)))]>, Requires<[IsARM, HasV6]> {
@@ -1300,21 +1459,23 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm),
// FIXME: should be able to write a pattern for ARMcmov, but can't use
// a two-value operand where a dag node expects two operands. :(
def MOVCCr : AI1<0b1101, (outs GPR:$dst), (ins GPR:$false, GPR:$true), DPFrm,
- "mov", " $dst, $true",
+ IIC_iCMOVr, "mov", " $dst, $true",
[/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>,
RegConstraint<"$false = $dst">, UnaryDP;
def MOVCCs : AI1<0b1101, (outs GPR:$dst),
- (ins GPR:$false, so_reg:$true), DPSoRegFrm,
+ (ins GPR:$false, so_reg:$true), DPSoRegFrm, IIC_iCMOVsr,
"mov", " $dst, $true",
[/*(set GPR:$dst, (ARMcmov GPR:$false, so_reg:$true, imm:$cc, CCR:$ccr))*/]>,
RegConstraint<"$false = $dst">, UnaryDP;
def MOVCCi : AI1<0b1101, (outs GPR:$dst),
- (ins GPR:$false, so_imm:$true), DPFrm,
+ (ins GPR:$false, so_imm:$true), DPFrm, IIC_iCMOVi,
"mov", " $dst, $true",
[/*(set GPR:$dst, (ARMcmov GPR:$false, so_imm:$true, imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $dst">, UnaryDP;
+ RegConstraint<"$false = $dst">, UnaryDP {
+ let Inst{25} = 1;
+}
//===----------------------------------------------------------------------===//
@@ -1324,14 +1485,14 @@ def MOVCCi : AI1<0b1101, (outs GPR:$dst),
// __aeabi_read_tp preserves the registers r1-r3.
let isCall = 1,
Defs = [R0, R12, LR, CPSR] in {
- def TPsoft : ABXI<0b1011, (outs), (ins),
+ def TPsoft : ABXI<0b1011, (outs), (ins), IIC_Br,
"bl __aeabi_read_tp",
[(set R0, ARMthread_pointer)]>;
}
//===----------------------------------------------------------------------===//
// SJLJ Exception handling intrinsics
-// eh_sjlj_setjmp() is a three instruction sequence to store the return
+// eh_sjlj_setjmp() is an instruction sequence to store the return
// address and save #0 in R0 for the non-longjmp case.
// Since by its nature we may be coming from some other function to get
// here, and we're using the stack frame for the containing function to
@@ -1342,13 +1503,19 @@ let isCall = 1,
// doing so, we also cause the prologue/epilogue code to actively preserve
// all of the callee-saved registers, which is exactly what we want.
let Defs =
- [ R0, R1, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR,
- D0, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15 ] in {
+ [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0,
+ D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
+ D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
+ D31 ] in {
def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src),
- AddrModeNone, SizeSpecial, IndexModeNone, Pseudo,
- "add r0, pc, #4\n\t"
- "str r0, [$src, #+4]\n\t"
- "mov r0, #0 @ eh_setjmp", "",
+ AddrModeNone, SizeSpecial, IndexModeNone,
+ Pseudo, NoItinerary,
+ "str sp, [$src, #+8] @ eh_setjmp begin\n\t"
+ "add r12, pc, #8\n\t"
+ "str r12, [$src, #+4]\n\t"
+ "mov r0, #0\n\t"
+ "add pc, pc, #0\n\t"
+ "mov r0, #1 @ eh_setjmp end", "",
[(set R0, (ARMeh_sjlj_setjmp GPR:$src))]>;
}
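
The expanded sequence stores sp and a resume address into the jump buffer,
returns 0 on the direct path, and lays out the code so a later longjmp lands
on the trailing 'mov r0, #1'. A rough C++ picture of the buffer slots the asm
above assumes; the struct and field names are illustrative only:

    #include <cstdint>

    // Offsets mirror the asm: +4 holds the resume pc, +8 holds sp.
    struct SjLjBuf {
      uint32_t slot0;     // +0 (not touched by this sequence)
      uint32_t resumePC;  // +4: where longjmp resumes ('mov r0, #1')
      uint32_t savedSP;   // +8: stack pointer at setjmp time
    };

    int modelSetjmp(SjLjBuf &buf, uint32_t sp, uint32_t addrOfAdd) {
      buf.savedSP = sp;
      // 'add r12, pc, #8': pc reads as addrOfAdd + 8, so r12 points 16
      // bytes past the add, i.e. at the 'mov r0, #1'.
      buf.resumePC = addrOfAdd + 16;
      return 0;  // direct path; the longjmp re-entry path yields 1
    }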
@@ -1366,25 +1533,36 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
// Two piece so_imms.
let isReMaterializable = 1 in
-def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src), Pseudo,
+def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src),
+ Pseudo, IIC_iMOVi,
"mov", " $dst, $src",
- [(set GPR:$dst, so_imm2part:$src)]>;
+ [(set GPR:$dst, so_imm2part:$src)]>,
+ Requires<[IsARM, NoV6T2]>;
def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS),
- (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
- (so_imm2part_2 imm:$RHS))>;
+ (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+ (so_imm2part_2 imm:$RHS))>;
def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS),
- (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
- (so_imm2part_2 imm:$RHS))>;
+ (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+ (so_imm2part_2 imm:$RHS))>;
+
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction to make it re-materializable. Remove
+// when we can do generalized remat.
+let isReMaterializable = 1 in
+def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi,
+ "movw", " $dst, ${src:lo16}\n\tmovt${p} $dst, ${src:hi16}",
+ [(set GPR:$dst, (i32 imm:$src))]>,
+ Requires<[IsARM, HasV6T2]>;
// TODO: add,sub,and, 3-instr forms?
// Direct calls
def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>,
- Requires<[IsNotDarwin]>;
+ Requires<[IsARM, IsNotDarwin]>;
def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>,
- Requires<[IsDarwin]>;
+ Requires<[IsARM, IsDarwin]>;
// zextload i1 -> zextload i8
def : ARMPat<(zextloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index a62597b..cd370aa 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -65,8 +65,28 @@ def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
-def NEONvduplaneq : SDNode<"ARMISD::VDUPLANEQ",
- SDTypeProfile<1, 2, [SDTCisVT<2, i32>]>>;
+def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
+
+// VDUPLANE can produce a quad-register result from a double-register source,
+// so the result is not constrained to match the source.
+def NEONvduplane : SDNode<"ARMISD::VDUPLANE",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisVT<2, i32>]>>;
+
+def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>;
+
+def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
+def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
+def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
+
+def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
+def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
+def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
//===----------------------------------------------------------------------===//
// NEON operand definitions
@@ -87,28 +107,409 @@ def addrmode_neonldstm : Operand<i32>,
//===----------------------------------------------------------------------===//
/* TODO: Take advantage of vldm.
-let mayLoad = 1 in {
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
def VLDMD : NI<(outs),
(ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops),
+ IIC_fpLoadm,
"vldm${addr:submode} ${addr:base}, $dst1",
- []>;
+ []> {
+ let Inst{27-25} = 0b110;
+ let Inst{20} = 1;
+ let Inst{11-9} = 0b101;
+}
def VLDMS : NI<(outs),
(ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops),
+ IIC_fpLoadm,
"vldm${addr:submode} ${addr:base}, $dst1",
- []>;
+ []> {
+ let Inst{27-25} = 0b110;
+ let Inst{20} = 1;
+ let Inst{11-9} = 0b101;
+}
}
*/
// Use vldmia to load a Q register as a D register pair.
-def VLDRQ : NI<(outs QPR:$dst), (ins GPR:$addr),
+def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr),
+ IIC_fpLoadm,
"vldmia $addr, ${dst:dregpair}",
- [(set QPR:$dst, (v2f64 (load GPR:$addr)))]>;
+ [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> {
+ let Inst{27-25} = 0b110;
+ let Inst{24} = 0; // P bit
+ let Inst{23} = 1; // U bit
+ let Inst{20} = 1;
+ let Inst{11-9} = 0b101;
+}
// Use vstmia to store a Q register as a D register pair.
-def VSTRQ : NI<(outs), (ins QPR:$src, GPR:$addr),
+def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr),
+ IIC_fpStorem,
"vstmia $addr, ${src:dregpair}",
- [(store (v2f64 QPR:$src), GPR:$addr)]>;
+ [(store (v2f64 QPR:$src), addrmode4:$addr)]> {
+ let Inst{27-25} = 0b110;
+ let Inst{24} = 0; // P bit
+ let Inst{23} = 1; // U bit
+ let Inst{20} = 0;
+ let Inst{11-9} = 0b101;
+}
+
+// VLD1 : Vector Load (multiple single elements)
+class VLD1D<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), (ins addrmode6:$addr), IIC_VLD1,
+ !strconcat(OpcodeStr, "\t\\{$dst\\}, $addr"), "",
+ [(set DPR:$dst, (Ty (IntOp addrmode6:$addr)))]>;
+class VLD1Q<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst), (ins addrmode6:$addr), IIC_VLD1,
+ !strconcat(OpcodeStr, "\t${dst:dregpair}, $addr"), "",
+ [(set QPR:$dst, (Ty (IntOp addrmode6:$addr)))]>;
+
+def VLD1d8 : VLD1D<0b0000, "vld1.8", v8i8, int_arm_neon_vld1>;
+def VLD1d16 : VLD1D<0b0100, "vld1.16", v4i16, int_arm_neon_vld1>;
+def VLD1d32 : VLD1D<0b1000, "vld1.32", v2i32, int_arm_neon_vld1>;
+def VLD1df : VLD1D<0b1000, "vld1.32", v2f32, int_arm_neon_vld1>;
+def VLD1d64 : VLD1D<0b1100, "vld1.64", v1i64, int_arm_neon_vld1>;
+
+def VLD1q8 : VLD1Q<0b0000, "vld1.8", v16i8, int_arm_neon_vld1>;
+def VLD1q16 : VLD1Q<0b0100, "vld1.16", v8i16, int_arm_neon_vld1>;
+def VLD1q32 : VLD1Q<0b1000, "vld1.32", v4i32, int_arm_neon_vld1>;
+def VLD1qf : VLD1Q<0b1000, "vld1.32", v4f32, int_arm_neon_vld1>;
+def VLD1q64 : VLD1Q<0b1100, "vld1.64", v2i64, int_arm_neon_vld1>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
+
+// VLD2 : Vector Load (multiple 2-element structures)
+class VLD2D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b1000,op7_4, (outs DPR:$dst1, DPR:$dst2),
+ (ins addrmode6:$addr), IIC_VLD2,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2\\}, $addr"), "", []>;
+class VLD2Q<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b0011,op7_4,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6:$addr), IIC_VLD2,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr"),
+ "", []>;
+
+def VLD2d8 : VLD2D<0b0000, "vld2.8">;
+def VLD2d16 : VLD2D<0b0100, "vld2.16">;
+def VLD2d32 : VLD2D<0b1000, "vld2.32">;
+def VLD2d64 : NLdSt<0,0b10,0b1010,0b1100, (outs DPR:$dst1, DPR:$dst2),
+ (ins addrmode6:$addr), IIC_VLD1,
+ "vld1.64\t\\{$dst1,$dst2\\}, $addr", "", []>;
+
+def VLD2q8 : VLD2Q<0b0000, "vld2.8">;
+def VLD2q16 : VLD2Q<0b0100, "vld2.16">;
+def VLD2q32 : VLD2Q<0b1000, "vld2.32">;
+
+// VLD3 : Vector Load (multiple 3-element structures)
+class VLD3D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b0100,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
+ (ins addrmode6:$addr), IIC_VLD3,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3\\}, $addr"), "", []>;
+class VLD3WB<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b0101,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb),
+ (ins addrmode6:$addr), IIC_VLD3,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3\\}, $addr"),
+ "$addr.addr = $wb", []>;
+
+def VLD3d8 : VLD3D<0b0000, "vld3.8">;
+def VLD3d16 : VLD3D<0b0100, "vld3.16">;
+def VLD3d32 : VLD3D<0b1000, "vld3.32">;
+def VLD3d64 : NLdSt<0,0b10,0b0110,0b1100,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
+ (ins addrmode6:$addr), IIC_VLD1,
+ "vld1.64\t\\{$dst1,$dst2,$dst3\\}, $addr", "", []>;
+
+// vld3 to double-spaced even registers.
+def VLD3q8a : VLD3WB<0b0000, "vld3.8">;
+def VLD3q16a : VLD3WB<0b0100, "vld3.16">;
+def VLD3q32a : VLD3WB<0b1000, "vld3.32">;
+
+// vld3 to double-spaced odd registers.
+def VLD3q8b : VLD3WB<0b0000, "vld3.8">;
+def VLD3q16b : VLD3WB<0b0100, "vld3.16">;
+def VLD3q32b : VLD3WB<0b1000, "vld3.32">;
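+// Note: a vld3 into Q registers covers six D registers, so it is modeled as
+// an even-half ("a", e.g. d0, d2, d4) plus an odd-half ("b", e.g. d1, d3,
+// d5) instruction; the GPR:$wb result ($addr.addr = $wb) carries the
+// post-incremented address from the first half to the second.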
+
+// VLD4 : Vector Load (multiple 4-element structures)
+class VLD4D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b0000,op7_4,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6:$addr), IIC_VLD4,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr"),
+ "", []>;
+class VLD4WB<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b0001,op7_4,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+ (ins addrmode6:$addr), IIC_VLD4,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr"),
+ "$addr.addr = $wb", []>;
+
+def VLD4d8 : VLD4D<0b0000, "vld4.8">;
+def VLD4d16 : VLD4D<0b0100, "vld4.16">;
+def VLD4d32 : VLD4D<0b1000, "vld4.32">;
+def VLD4d64 : NLdSt<0,0b10,0b0010,0b1100,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6:$addr), IIC_VLD1,
+ "vld1.64\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr", "", []>;
+
+// vld4 to double-spaced even registers.
+def VLD4q8a : VLD4WB<0b0000, "vld4.8">;
+def VLD4q16a : VLD4WB<0b0100, "vld4.16">;
+def VLD4q32a : VLD4WB<0b1000, "vld4.32">;
+
+// vld4 to double-spaced odd registers.
+def VLD4q8b : VLD4WB<0b0000, "vld4.8">;
+def VLD4q16b : VLD4WB<0b0100, "vld4.16">;
+def VLD4q32b : VLD4WB<0b1000, "vld4.32">;
+
+// VLD1LN : Vector Load (single element to one lane)
+// FIXME: Not yet implemented.
+
+// VLD2LN : Vector Load (single 2-element structure to one lane)
+class VLD2LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b10,op11_8,0b0000, (outs DPR:$dst1, DPR:$dst2),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane),
+ IIC_VLD2,
+ !strconcat(OpcodeStr, "\t\\{$dst1[$lane],$dst2[$lane]\\}, $addr"),
+ "$src1 = $dst1, $src2 = $dst2", []>;
+
+def VLD2LNd8 : VLD2LN<0b0001, "vld2.8">;
+def VLD2LNd16 : VLD2LN<0b0101, "vld2.16">;
+def VLD2LNd32 : VLD2LN<0b1001, "vld2.32">;
+
+// vld2 to double-spaced even registers.
+def VLD2LNq16a: VLD2LN<0b0101, "vld2.16">;
+def VLD2LNq32a: VLD2LN<0b1001, "vld2.32">;
+
+// vld2 to double-spaced odd registers.
+def VLD2LNq16b: VLD2LN<0b0101, "vld2.16">;
+def VLD2LNq32b: VLD2LN<0b1001, "vld2.32">;
+
+// VLD3LN : Vector Load (single 3-element structure to one lane)
+class VLD3LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b10,op11_8,0b0000, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
+ nohash_imm:$lane), IIC_VLD3,
+ !strconcat(OpcodeStr,
+ "\t\\{$dst1[$lane],$dst2[$lane],$dst3[$lane]\\}, $addr"),
+ "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>;
+
+def VLD3LNd8 : VLD3LN<0b0010, "vld3.8">;
+def VLD3LNd16 : VLD3LN<0b0110, "vld3.16">;
+def VLD3LNd32 : VLD3LN<0b1010, "vld3.32">;
+
+// vld3 to double-spaced even registers.
+def VLD3LNq16a: VLD3LN<0b0110, "vld3.16">;
+def VLD3LNq32a: VLD3LN<0b1010, "vld3.32">;
+
+// vld3 to double-spaced odd registers.
+def VLD3LNq16b: VLD3LN<0b0110, "vld3.16">;
+def VLD3LNq32b: VLD3LN<0b1010, "vld3.32">;
+
+// VLD4LN : Vector Load (single 4-element structure to one lane)
+class VLD4LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b10,op11_8,0b0000,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
+ nohash_imm:$lane), IIC_VLD4,
+ !strconcat(OpcodeStr,
+ "\t\\{$dst1[$lane],$dst2[$lane],$dst3[$lane],$dst4[$lane]\\}, $addr"),
+ "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>;
+
+def VLD4LNd8 : VLD4LN<0b0011, "vld4.8">;
+def VLD4LNd16 : VLD4LN<0b0111, "vld4.16">;
+def VLD4LNd32 : VLD4LN<0b1011, "vld4.32">;
+
+// vld4 to double-spaced even registers.
+def VLD4LNq16a: VLD4LN<0b0111, "vld4.16">;
+def VLD4LNq32a: VLD4LN<0b1011, "vld4.32">;
+
+// vld4 to double-spaced odd registers.
+def VLD4LNq16b: VLD4LN<0b0111, "vld4.16">;
+def VLD4LNq32b: VLD4LN<0b1011, "vld4.32">;
+
+// VLD1DUP : Vector Load (single element to all lanes)
+// VLD2DUP : Vector Load (single 2-element structure to all lanes)
+// VLD3DUP : Vector Load (single 3-element structure to all lanes)
+// VLD4DUP : Vector Load (single 4-element structure to all lanes)
+// FIXME: Not yet implemented.
+} // mayLoad = 1, hasExtraDefRegAllocReq = 1
+
+// VST1 : Vector Store (multiple single elements)
+class VST1D<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src\\}, $addr"), "",
+ [(IntOp addrmode6:$addr, (Ty DPR:$src))]>;
+class VST1Q<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$addr, QPR:$src), IIC_VST,
+ !strconcat(OpcodeStr, "\t${src:dregpair}, $addr"), "",
+ [(IntOp addrmode6:$addr, (Ty QPR:$src))]>;
+
+let hasExtraSrcRegAllocReq = 1 in {
+def VST1d8 : VST1D<0b0000, "vst1.8", v8i8, int_arm_neon_vst1>;
+def VST1d16 : VST1D<0b0100, "vst1.16", v4i16, int_arm_neon_vst1>;
+def VST1d32 : VST1D<0b1000, "vst1.32", v2i32, int_arm_neon_vst1>;
+def VST1df : VST1D<0b1000, "vst1.32", v2f32, int_arm_neon_vst1>;
+def VST1d64 : VST1D<0b1100, "vst1.64", v1i64, int_arm_neon_vst1>;
+
+def VST1q8 : VST1Q<0b0000, "vst1.8", v16i8, int_arm_neon_vst1>;
+def VST1q16 : VST1Q<0b0100, "vst1.16", v8i16, int_arm_neon_vst1>;
+def VST1q32 : VST1Q<0b1000, "vst1.32", v4i32, int_arm_neon_vst1>;
+def VST1qf : VST1Q<0b1000, "vst1.32", v4f32, int_arm_neon_vst1>;
+def VST1q64 : VST1Q<0b1100, "vst1.64", v2i64, int_arm_neon_vst1>;
+} // hasExtraSrcRegAllocReq
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in {
+
+// VST2 : Vector Store (multiple 2-element structures)
+class VST2D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b1000,op7_4, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2\\}, $addr"), "", []>;
+class VST2Q<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b0011,op7_4, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
+ IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"),
+ "", []>;
+
+def VST2d8 : VST2D<0b0000, "vst2.8">;
+def VST2d16 : VST2D<0b0100, "vst2.16">;
+def VST2d32 : VST2D<0b1000, "vst2.32">;
+def VST2d64 : NLdSt<0,0b00,0b1010,0b1100, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST,
+ "vst1.64\t\\{$src1,$src2\\}, $addr", "", []>;
+
+def VST2q8 : VST2Q<0b0000, "vst2.8">;
+def VST2q16 : VST2Q<0b0100, "vst2.16">;
+def VST2q32 : VST2Q<0b1000, "vst2.32">;
+
+// VST3 : Vector Store (multiple 3-element structures)
+class VST3D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b0100,op7_4, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3\\}, $addr"), "", []>;
+class VST3WB<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b0101,op7_4, (outs GPR:$wb),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3\\}, $addr"),
+ "$addr.addr = $wb", []>;
+
+def VST3d8 : VST3D<0b0000, "vst3.8">;
+def VST3d16 : VST3D<0b0100, "vst3.16">;
+def VST3d32 : VST3D<0b1000, "vst3.32">;
+def VST3d64 : NLdSt<0,0b00,0b0110,0b1100, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3),
+ IIC_VST,
+ "vst1.64\t\\{$src1,$src2,$src3\\}, $addr", "", []>;
+
+// vst3 to double-spaced even registers.
+def VST3q8a : VST3WB<0b0000, "vst3.8">;
+def VST3q16a : VST3WB<0b0100, "vst3.16">;
+def VST3q32a : VST3WB<0b1000, "vst3.32">;
+
+// vst3 to double-spaced odd registers.
+def VST3q8b : VST3WB<0b0000, "vst3.8">;
+def VST3q16b : VST3WB<0b0100, "vst3.16">;
+def VST3q32b : VST3WB<0b1000, "vst3.32">;
+
+// VST4 : Vector Store (multiple 4-element structures)
+class VST4D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b0000,op7_4, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
+ IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"),
+ "", []>;
+class VST4WB<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b0001,op7_4, (outs GPR:$wb),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
+ IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"),
+ "$addr.addr = $wb", []>;
+
+def VST4d8 : VST4D<0b0000, "vst4.8">;
+def VST4d16 : VST4D<0b0100, "vst4.16">;
+def VST4d32 : VST4D<0b1000, "vst4.32">;
+def VST4d64 : NLdSt<0,0b00,0b0010,0b1100, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
+ DPR:$src4), IIC_VST,
+ "vst1.64\t\\{$src1,$src2,$src3,$src4\\}, $addr", "", []>;
+
+// vst4 to double-spaced even registers.
+def VST4q8a : VST4WB<0b0000, "vst4.8">;
+def VST4q16a : VST4WB<0b0100, "vst4.16">;
+def VST4q32a : VST4WB<0b1000, "vst4.32">;
+
+// vst4 to double-spaced odd registers.
+def VST4q8b : VST4WB<0b0000, "vst4.8">;
+def VST4q16b : VST4WB<0b0100, "vst4.16">;
+def VST4q32b : VST4WB<0b1000, "vst4.32">;
+
+// VST1LN : Vector Store (single element from one lane)
+// FIXME: Not yet implemented.
+
+// VST2LN : Vector Store (single 2-element structure from one lane)
+class VST2LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b00,op11_8,0b0000, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane),
+ IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1[$lane],$src2[$lane]\\}, $addr"),
+ "", []>;
+
+def VST2LNd8 : VST2LN<0b0000, "vst2.8">;
+def VST2LNd16 : VST2LN<0b0100, "vst2.16">;
+def VST2LNd32 : VST2LN<0b1000, "vst2.32">;
+
+// vst2 to double-spaced even registers.
+def VST2LNq16a: VST2LN<0b0100, "vst2.16">;
+def VST2LNq32a: VST2LN<0b1000, "vst2.32">;
+
+// vst2 to double-spaced odd registers.
+def VST2LNq16b: VST2LN<0b0100, "vst2.16">;
+def VST2LNq32b: VST2LN<0b1000, "vst2.32">;
+
+// VST3LN : Vector Store (single 3-element structure from one lane)
+class VST3LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b00,op11_8,0b0000, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
+ nohash_imm:$lane), IIC_VST,
+ !strconcat(OpcodeStr,
+ "\t\\{$src1[$lane],$src2[$lane],$src3[$lane]\\}, $addr"), "", []>;
+
+def VST3LNd8 : VST3LN<0b0010, "vst3.8">;
+def VST3LNd16 : VST3LN<0b0110, "vst3.16">;
+def VST3LNd32 : VST3LN<0b1010, "vst3.32">;
+
+// vst3 to double-spaced even registers.
+def VST3LNq16a: VST3LN<0b0110, "vst3.16">;
+def VST3LNq32a: VST3LN<0b1010, "vst3.32">;
+
+// vst3 to double-spaced odd registers.
+def VST3LNq16b: VST3LN<0b0110, "vst3.16">;
+def VST3LNq32b: VST3LN<0b1010, "vst3.32">;
+
+// VST4LN : Vector Store (single 4-element structure from one lane)
+class VST4LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b00,op11_8,0b0000, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
+ nohash_imm:$lane), IIC_VST,
+ !strconcat(OpcodeStr,
+ "\t\\{$src1[$lane],$src2[$lane],$src3[$lane],$src4[$lane]\\}, $addr"),
+ "", []>;
+
+def VST4LNd8 : VST4LN<0b0011, "vst4.8">;
+def VST4LNd16 : VST4LN<0b0111, "vst4.16">;
+def VST4LNd32 : VST4LN<0b1011, "vst4.32">;
+
+// vst4 to double-spaced even registers.
+def VST4LNq16a: VST4LN<0b0111, "vst4.16">;
+def VST4LNq32a: VST4LN<0b1011, "vst4.32">;
+
+// vst4 to double-spaced odd registers.
+def VST4LNq16b: VST4LN<0b0111, "vst4.16">;
+def VST4LNq32b: VST4LN<0b1011, "vst4.32">;
+
+} // mayStore = 1, hasExtraSrcRegAllocReq = 1
//===----------------------------------------------------------------------===//
@@ -117,18 +518,27 @@ def VSTRQ : NI<(outs), (ins QPR:$src, GPR:$addr),
// Extract D sub-registers of Q registers.
// (arm_dsubreg_0 is 5; arm_dsubreg_1 is 6)
-def SubReg_i8_reg : SDNodeXForm<imm, [{
+def DSubReg_i8_reg : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(5 + N->getZExtValue() / 8, MVT::i32);
}]>;
-def SubReg_i16_reg : SDNodeXForm<imm, [{
+def DSubReg_i16_reg : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(5 + N->getZExtValue() / 4, MVT::i32);
}]>;
-def SubReg_i32_reg : SDNodeXForm<imm, [{
+def DSubReg_i32_reg : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(5 + N->getZExtValue() / 2, MVT::i32);
}]>;
-def SubReg_f64_reg : SDNodeXForm<imm, [{
+def DSubReg_f64_reg : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(5 + N->getZExtValue(), MVT::i32);
}]>;
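+// Worked example (illustrative): for lane 5 of a v8i16 Q register,
+// DSubReg_i16_reg yields 5 + 5/4 = 6, i.e. arm_dsubreg_1 (the high D half).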
+def DSubReg_f64_other_reg : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(5 + (1 - N->getZExtValue()), MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers.
+// (arm_ssubreg_0 is 1; arm_ssubreg_1 is 2; etc.)
+def SSubReg_f32_reg : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(1 + N->getZExtValue(), MVT::i32);
+}]>;
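+// e.g. lane 3 of a Q register maps to 1 + 3 = 4, i.e. arm_ssubreg_3.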
// Translate lane numbers from Q registers to D subregs.
def SubReg_i8_lane : SDNodeXForm<imm, [{
@@ -150,117 +560,337 @@ class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, SDNode OpNode>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
- (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins DPR:$src), IIC_VUNAD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>;
class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, SDNode OpNode>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
- (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins QPR:$src), IIC_VUNAQ, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>;
+// Basic 2-register operations, scalar single-precision.
+class N2VDs<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, SDNode OpNode>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+ (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src),
+ IIC_VUNAD, !strconcat(OpcodeStr, "\t$dst, $src"), "", []>;
+
+class N2VDsPat<SDNode OpNode, ValueType ResTy, ValueType OpTy, NeonI Inst>
+ : NEONFPPat<(ResTy (OpNode SPR:$a)),
+ (EXTRACT_SUBREG
+ (Inst (INSERT_SUBREG (OpTy (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)),
+ arm_ssubreg_0)>;
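+// In words: the f32 operand is inserted into the low S lane of an undef
+// D register, the D-register NEON instruction runs, and the scalar result
+// is extracted from the same lane, letting NEON execute scalar
+// single-precision FP (a sketch of the intent, not new functionality).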
+
// Basic 2-register intrinsics, both double- and quad-register.
class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
- bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ InstrItinClass itin, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
- (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins DPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
- bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ InstrItinClass itin, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
- (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins QPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
+// Basic 2-register intrinsics, scalar single-precision
+class N2VDInts<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+ (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "", []>;
+
+class N2VDIntsPat<SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f32 (OpNode SPR:$a)),
+ (EXTRACT_SUBREG
+ (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)),
+ arm_ssubreg_0)>;
+
// Narrow 2-register intrinsics.
class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
- string OpcodeStr, ValueType TyD, ValueType TyQ, Intrinsic IntOp>
+ InstrItinClass itin, string OpcodeStr,
+ ValueType TyD, ValueType TyQ, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst),
- (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins QPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>;
// Long 2-register intrinsics. (This is currently only used for VMOVL and is
// derived from N2VImm instead of N2V because of the way the size is encoded.)
class N2VLInt<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
- bit op6, bit op4, string OpcodeStr, ValueType TyQ, ValueType TyD,
- Intrinsic IntOp>
+ bit op6, bit op4, InstrItinClass itin, string OpcodeStr,
+ ValueType TyQ, ValueType TyD, Intrinsic IntOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, (outs QPR:$dst),
- (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins DPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>;
+// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register.
+class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr>
+ : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$dst1, DPR:$dst2),
+ (ins DPR:$src1, DPR:$src2), IIC_VPERMD,
+ !strconcat(OpcodeStr, "\t$dst1, $dst2"),
+ "$src1 = $dst1, $src2 = $dst2", []>;
+class N2VQShuffle<bits<2> op19_18, bits<5> op11_7,
+ InstrItinClass itin, string OpcodeStr>
+ : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$dst1, QPR:$dst2),
+ (ins QPR:$src1, QPR:$src2), itin,
+ !strconcat(OpcodeStr, "\t$dst1, $dst2"),
+ "$src1 = $dst1, $src2 = $dst2", []>;
+
// Basic 3-register operations, both double- and quad-register.
class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy,
SDNode OpNode, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
let isCommutable = Commutable;
}
+class N3VDSL<bits<2> op21_20, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr, ValueType Ty, SDNode ShOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (Ty DPR:$dst),
+ (Ty (ShOp (Ty DPR:$src1),
+ (Ty (NEONvduplane (Ty DPR_VFP2:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VDSL16<bits<2> op21_20, bits<4> op11_8,
+ string OpcodeStr, ValueType Ty, SDNode ShOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ IIC_VMULi16D,
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (Ty DPR:$dst),
+ (Ty (ShOp (Ty DPR:$src1),
+ (Ty (NEONvduplane (Ty DPR_8:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
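+// Note: the by-lane scalar uses a restricted register class because the
+// encoding steals bits from the scalar register field for the lane index:
+// 32-bit elements may use d0-d15 (DPR_VFP2), 16-bit elements only d0-d7
+// (DPR_8).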
+
class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy,
SDNode OpNode, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
- (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
let isCommutable = Commutable;
}
+class N3VQSL<bits<2> op21_20, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, SDNode ShOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (ShOp (ResTy QPR:$src1),
+ (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VQSL16<bits<2> op21_20, bits<4> op11_8,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode ShOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ IIC_VMULi16Q,
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (ShOp (ResTy QPR:$src1),
+ (ResTy (NEONvduplane (OpTy DPR_8:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+
+// Basic 3-register operations, scalar single-precision
+class N3VDs<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ SDNode OpNode, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND,
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", []> {
+ let isCommutable = Commutable;
+}
+class N3VDsPat<SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)),
+ (EXTRACT_SUBREG
+ (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, arm_ssubreg_0)),
+ arm_ssubreg_0)>;
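+// The same lane-0 insert/execute/extract scheme as N2VDsPat, here with two
+// f32 operands.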
// Basic 3-register intrinsics, both double- and quad-register.
class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
let isCommutable = Commutable;
}
+class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (Ty DPR:$dst),
+ (Ty (IntOp (Ty DPR:$src1),
+ (Ty (NEONvduplane (Ty DPR_VFP2:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (Ty DPR:$dst),
+ (Ty (IntOp (Ty DPR:$src1),
+ (Ty (NEONvduplane (Ty DPR_8:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+
class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
- (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
let isCommutable = Commutable;
}
+class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (ResTy QPR:$src1),
+ (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (ResTy QPR:$src1),
+ (ResTy (NEONvduplane (OpTy DPR_8:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
// Multiply-Add/Sub operations, both double- and quad-register.
class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode>
+ InstrItinClass itin, string OpcodeStr,
+ ValueType Ty, SDNode MulOp, SDNode OpNode>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set DPR:$dst, (Ty (OpNode DPR:$src1,
(Ty (MulOp DPR:$src2, DPR:$src3)))))]>;
+class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode ShOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst),
+ (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (Ty DPR:$dst),
+ (Ty (ShOp (Ty DPR:$src1),
+ (Ty (MulOp DPR:$src2,
+ (Ty (NEONvduplane (Ty DPR_VFP2:$src3),
+ imm:$lane)))))))]>;
+class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode ShOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst),
+ (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (Ty DPR:$dst),
+ (Ty (ShOp (Ty DPR:$src1),
+ (Ty (MulOp DPR:$src2,
+ (Ty (NEONvduplane (Ty DPR_8:$src3),
+ imm:$lane)))))))]>;
+
class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode>
+ InstrItinClass itin, string OpcodeStr, ValueType Ty,
+ SDNode MulOp, SDNode OpNode>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
- (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set QPR:$dst, (Ty (OpNode QPR:$src1,
(Ty (MulOp QPR:$src2, QPR:$src3)))))]>;
+class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ SDNode MulOp, SDNode ShOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst),
+ (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (ResTy QPR:$dst),
+ (ResTy (ShOp (ResTy QPR:$src1),
+ (ResTy (MulOp QPR:$src2,
+ (ResTy (NEONvduplane (OpTy DPR_VFP2:$src3),
+ imm:$lane)))))))]>;
+class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ SDNode MulOp, SDNode ShOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst),
+ (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (ResTy QPR:$dst),
+ (ResTy (ShOp (ResTy QPR:$src1),
+ (ResTy (MulOp QPR:$src2,
+ (ResTy (NEONvduplane (OpTy DPR_8:$src3),
+ imm:$lane)))))))]>;
+
+// Multiply-Add/Sub operations, scalar single-precision
+class N3VDMulOps<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr,
+ ValueType Ty, SDNode MulOp, SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR_VFP2:$dst),
+ (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", []>;
+
+class N3VDMulOpsPat<SDNode MulNode, SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))),
+ (EXTRACT_SUBREG
+ (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$acc, arm_ssubreg_0),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, arm_ssubreg_0)),
+ arm_ssubreg_0)>;
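+// e.g. (illustrative) a scalar a*b+acc (an fadd of an fmul) folds into one
+// D-register vmla.f32, with all three f32 inputs placed in lane 0.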
// Neon 3-argument intrinsics, both double- and quad-register.
// The destination register is also used as the first source operand register.
class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
- Intrinsic IntOp>
+ InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1),
(OpTy DPR:$src2), (OpTy DPR:$src3))))]>;
class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
- Intrinsic IntOp>
+ InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
- (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1),
(OpTy QPR:$src2), (OpTy QPR:$src3))))]>;
@@ -268,19 +898,44 @@ class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
// Neon Long 3-argument intrinsic. The destination register is
// a quad-register and is also used as the first source operand register.
class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+ InstrItinClass itin, string OpcodeStr,
+ ValueType TyQ, ValueType TyD, Intrinsic IntOp>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3),
+ (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), itin,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set QPR:$dst,
(TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>;
+class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst),
+ (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (ResTy QPR:$src1),
+ (OpTy DPR:$src2),
+ (OpTy (NEONvduplane (OpTy DPR_VFP2:$src3),
+ imm:$lane)))))]>;
+class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ Intrinsic IntOp>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst),
+ (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (ResTy QPR:$src1),
+ (OpTy DPR:$src2),
+ (OpTy (NEONvduplane (OpTy DPR_8:$src3),
+ imm:$lane)))))]>;
+
// Narrowing 3-register intrinsics.
class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, ValueType TyD, ValueType TyQ,
Intrinsic IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs DPR:$dst), (ins QPR:$src1, QPR:$src2),
+ (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VBINi4D,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> {
let isCommutable = Commutable;
@@ -288,21 +943,40 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
// Long 3-register intrinsics.
class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType TyQ, ValueType TyD,
+ InstrItinClass itin, string OpcodeStr, ValueType TyQ, ValueType TyD,
Intrinsic IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs QPR:$dst), (ins DPR:$src1, DPR:$src2),
+ (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), itin,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> {
let isCommutable = Commutable;
}
+class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (OpTy DPR:$src1),
+ (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+ imm:$lane)))))]>;
+class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ Intrinsic IntOp>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (OpTy DPR:$src1),
+ (OpTy (NEONvduplane (OpTy DPR_8:$src2),
+ imm:$lane)))))]>;
// Wide 3-register intrinsics.
class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, ValueType TyQ, ValueType TyD,
Intrinsic IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs QPR:$dst), (ins QPR:$src1, DPR:$src2),
+ (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), IIC_VSUBiD,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> {
let isCommutable = Commutable;
@@ -313,13 +987,13 @@ class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
- (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins DPR:$src), IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
- (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins QPR:$src), IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
// Pairwise long 2-register accumulate intrinsics,
@@ -329,29 +1003,31 @@ class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
- (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), IIC_VPALiD,
!strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst",
[(set DPR:$dst, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$src2))))]>;
class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
- (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VPALiQ,
!strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst",
[(set QPR:$dst, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$src2))))]>;
// Shift by immediate,
// both double- and quad-register.
class N2VDSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
- bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode>
+ bit op4, InstrItinClass itin, string OpcodeStr,
+ ValueType Ty, SDNode OpNode>
: N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
- (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM),
+ (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), itin,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>;
class N2VQSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
- bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode>
+ bit op4, InstrItinClass itin, string OpcodeStr,
+ ValueType Ty, SDNode OpNode>
: N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
- (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM),
+ (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>;
@@ -360,17 +1036,17 @@ class N2VLSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op6, bit op4, string OpcodeStr, ValueType ResTy,
ValueType OpTy, SDNode OpNode>
: N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4,
- (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM),
+ (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VSHLiD,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src),
(i32 imm:$SIMM))))]>;
// Narrow shift by immediate.
class N2VNSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
- bit op6, bit op4, string OpcodeStr, ValueType ResTy,
- ValueType OpTy, SDNode OpNode>
+ bit op6, bit op4, InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, SDNode OpNode>
: N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4,
- (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM),
+ (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src),
(i32 imm:$SIMM))))]>;
@@ -381,6 +1057,7 @@ class N2VDShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
(outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
+ IIC_VPALiD,
!strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
[(set DPR:$dst, (Ty (add DPR:$src1,
(Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>;
@@ -388,6 +1065,7 @@ class N2VQShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
(outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
+ IIC_VPALiD,
!strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
[(set QPR:$dst, (Ty (add QPR:$src1,
(Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>;
@@ -398,12 +1076,14 @@ class N2VDShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
(outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
+ IIC_VSHLiD,
!strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
[(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>;
class N2VQShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
(outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
+ IIC_VSHLiQ,
!strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
[(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>;
@@ -413,14 +1093,14 @@ class N2VCvtD<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
- (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM),
+ (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VUNAD,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>;
class N2VCvtQ<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
- (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM),
+ (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), IIC_VUNAQ,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>;
@@ -428,50 +1108,68 @@ class N2VCvtQ<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
// Multiclasses
//===----------------------------------------------------------------------===//
+// Abbreviations used in multiclass suffixes:
+// Q = quarter int (8 bit) elements
+// H = half int (16 bit) elements
+// S = single int (32 bit) elements
+// D = double int (64 bit) elements
+
// Neon 3-register vector operations.
// First with only element sizes of 8, 16 and 32 bits:
multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, SDNode OpNode, bit Commutable = 0> {
// 64-bit vector types.
- def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
- v8i8, v8i8, OpNode, Commutable>;
- def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr, "16"),
- v4i16, v4i16, OpNode, Commutable>;
- def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr, "32"),
- v2i32, v2i32, OpNode, Commutable>;
+ def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, itinD16,
+ !strconcat(OpcodeStr, "8"), v8i8, v8i8, OpNode, Commutable>;
+ def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16,
+ !strconcat(OpcodeStr, "16"), v4i16, v4i16, OpNode, Commutable>;
+ def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32,
+ !strconcat(OpcodeStr, "32"), v2i32, v2i32, OpNode, Commutable>;
// 128-bit vector types.
- def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
- v16i8, v16i8, OpNode, Commutable>;
- def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr, "16"),
- v8i16, v8i16, OpNode, Commutable>;
- def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr, "32"),
- v4i32, v4i32, OpNode, Commutable>;
+ def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16,
+ !strconcat(OpcodeStr, "8"), v16i8, v16i8, OpNode, Commutable>;
+ def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16,
+ !strconcat(OpcodeStr, "16"), v8i16, v8i16, OpNode, Commutable>;
+ def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32,
+ !strconcat(OpcodeStr, "32"), v4i32, v4i32, OpNode, Commutable>;
+}
+
+multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> {
+  def v4i16 : N3VDSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
+  def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D,
+                     !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
+  def v8i16 : N3VQSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"),
+                       v8i16, v4i16, ShOp>;
+  def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q,
+                     !strconcat(OpcodeStr, "32"), v4i32, v2i32, ShOp>;
}
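+// Usage sketch (illustrative; assumes a by-scalar multiply defined later in
+// this file):
+//   defm VMULsl : N3VSL_HS<0b1000, "vmul.i", mul>;
+// instantiates the vmul.i16 and vmul.i32 by-lane forms for D and Q regs.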
// ....then also with element size 64 bits:
multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD, InstrItinClass itinQ,
string OpcodeStr, SDNode OpNode, bit Commutable = 0>
- : N3V_QHS<op24, op23, op11_8, op4, OpcodeStr, OpNode, Commutable> {
- def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr, "64"),
- v1i64, v1i64, OpNode, Commutable>;
- def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr, "64"),
- v2i64, v2i64, OpNode, Commutable>;
+ : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ,
+ OpcodeStr, OpNode, Commutable> {
+ def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD,
+ !strconcat(OpcodeStr, "64"), v1i64, v1i64, OpNode, Commutable>;
+ def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ,
+ !strconcat(OpcodeStr, "64"), v2i64, v2i64, OpNode, Commutable>;
}
// Neon Narrowing 2-register vector intrinsics,
// source operand element sizes of 16, 32 and 64 bits:
multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
- bits<5> op11_7, bit op6, bit op4, string OpcodeStr,
+ bits<5> op11_7, bit op6, bit op4,
+ InstrItinClass itin, string OpcodeStr,
Intrinsic IntOp> {
def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
- !strconcat(OpcodeStr, "16"), v8i8, v8i16, IntOp>;
+ itin, !strconcat(OpcodeStr, "16"), v8i8, v8i16, IntOp>;
def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
- !strconcat(OpcodeStr, "32"), v4i16, v4i32, IntOp>;
+ itin, !strconcat(OpcodeStr, "32"), v4i16, v4i32, IntOp>;
def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
- !strconcat(OpcodeStr, "64"), v2i32, v2i64, IntOp>;
+ itin, !strconcat(OpcodeStr, "64"), v2i32, v2i64, IntOp>;
}
@@ -480,11 +1178,11 @@ multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
multiclass N2VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
bit op4, string OpcodeStr, Intrinsic IntOp> {
def v8i16 : N2VLInt<op24, op23, 0b001000, op11_8, op7, op6, op4,
- !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
+ IIC_VQUNAiD, !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
def v4i32 : N2VLInt<op24, op23, 0b010000, op11_8, op7, op6, op4,
- !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
+ IIC_VQUNAiD, !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
def v2i64 : N2VLInt<op24, op23, 0b100000, op11_8, op7, op6, op4,
- !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
+ IIC_VQUNAiD, !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
}
@@ -492,38 +1190,56 @@ multiclass N2VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
// First with only element sizes of 16 and 32 bits:
multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
// 64-bit vector types.
- def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"),
+ def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, itinD16, !strconcat(OpcodeStr,"16"),
v4i16, v4i16, IntOp, Commutable>;
- def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"),
+ def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, itinD32, !strconcat(OpcodeStr,"32"),
v2i32, v2i32, IntOp, Commutable>;
// 128-bit vector types.
- def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"),
+ def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, itinQ16, !strconcat(OpcodeStr,"16"),
v8i16, v8i16, IntOp, Commutable>;
- def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"),
+ def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, itinQ32, !strconcat(OpcodeStr,"32"),
v4i32, v4i32, IntOp, Commutable>;
}
+multiclass N3VIntSL_HS<bits<4> op11_8,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, Intrinsic IntOp> {
+  def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16,
+                          !strconcat(OpcodeStr, "16"), v4i16, IntOp>;
+  def v2i32 : N3VDIntSL<0b10, op11_8, itinD32,
+                        !strconcat(OpcodeStr, "32"), v2i32, IntOp>;
+  def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16,
+                          !strconcat(OpcodeStr, "16"), v8i16, v4i16, IntOp>;
+  def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32,
+                        !strconcat(OpcodeStr, "32"), v4i32, v2i32, IntOp>;
+}
+
// ....then also with element size of 8 bits:
multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
- : N3VInt_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> {
- def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
- v8i8, v8i8, IntOp, Commutable>;
- def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
- v16i8, v16i8, IntOp, Commutable>;
+ : N3VInt_HS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32,
+ OpcodeStr, IntOp, Commutable> {
+ def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, itinD16,
+ !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp, Commutable>;
+ def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, itinQ16,
+ !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp, Commutable>;
}
// ....then also with element size of 64 bits:
multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
- : N3VInt_QHS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> {
- def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr,"64"),
- v1i64, v1i64, IntOp, Commutable>;
- def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr,"64"),
- v2i64, v2i64, IntOp, Commutable>;
+ : N3VInt_QHS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32,
+ OpcodeStr, IntOp, Commutable> {
+ def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, itinD32,
+ !strconcat(OpcodeStr,"64"), v1i64, v1i64, IntOp, Commutable>;
+ def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, itinQ32,
+ !strconcat(OpcodeStr,"64"), v2i64, v2i64, IntOp, Commutable>;
}
@@ -544,19 +1260,29 @@ multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
// First with only element sizes of 16 and 32 bits:
multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
- string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
- def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"),
- v4i32, v4i16, IntOp, Commutable>;
- def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"),
- v2i64, v2i32, IntOp, Commutable>;
+ InstrItinClass itin, string OpcodeStr,
+ Intrinsic IntOp, bit Commutable = 0> {
+ def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin,
+ !strconcat(OpcodeStr,"16"), v4i32, v4i16, IntOp, Commutable>;
+ def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin,
+ !strconcat(OpcodeStr,"32"), v2i64, v2i32, IntOp, Commutable>;
+}
+
+multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr, Intrinsic IntOp> {
+ def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin,
+ !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
+ def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin,
+ !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
}
// ....then also with element size of 8 bits:
multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
- string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
- : N3VLInt_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> {
- def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
- v8i16, v8i8, IntOp, Commutable>;
+ InstrItinClass itin, string OpcodeStr,
+ Intrinsic IntOp, bit Commutable = 0>
+ : N3VLInt_HS<op24, op23, op11_8, op4, itin, OpcodeStr, IntOp, Commutable> {
+ def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin,
+ !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp, Commutable>;
}
@@ -576,43 +1302,58 @@ multiclass N3VWInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// Neon Multiply-Op vector operations,
// element sizes of 8, 16 and 32 bits:
multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, SDNode OpNode> {
// 64-bit vector types.
- def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4,
+ def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16,
!strconcat(OpcodeStr, "8"), v8i8, mul, OpNode>;
- def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4,
+ def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16,
!strconcat(OpcodeStr, "16"), v4i16, mul, OpNode>;
- def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4,
+ def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32,
!strconcat(OpcodeStr, "32"), v2i32, mul, OpNode>;
// 128-bit vector types.
- def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4,
+ def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16,
!strconcat(OpcodeStr, "8"), v16i8, mul, OpNode>;
- def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4,
+ def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16,
!strconcat(OpcodeStr, "16"), v8i16, mul, OpNode>;
- def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4,
+ def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32,
!strconcat(OpcodeStr, "32"), v4i32, mul, OpNode>;
}
+multiclass N3VMulOpSL_HS<bits<4> op11_8,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, SDNode ShOp> {
+ def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16,
+ !strconcat(OpcodeStr, "16"), v4i16, mul, ShOp>;
+ def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32,
+ !strconcat(OpcodeStr, "32"), v2i32, mul, ShOp>;
+ def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16,
+ !strconcat(OpcodeStr, "16"), v8i16, v4i16, mul, ShOp>;
+ def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32,
+ !strconcat(OpcodeStr, "32"), v4i32, v2i32, mul, ShOp>;
+}
// Neon 3-argument intrinsics,
// element sizes of 8, 16 and 32 bits:
multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, Intrinsic IntOp> {
// 64-bit vector types.
- def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4,
+ def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D,
!strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>;
- def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4,
+ def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
!strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>;
- def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4,
+ def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32D,
!strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>;
// 128-bit vector types.
- def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4,
+ def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16Q,
!strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>;
- def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4,
+ def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16Q,
!strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>;
- def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4,
+ def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32Q,
!strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>;
}
@@ -622,17 +1363,25 @@ multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// First with only element sizes of 16 and 32 bits:
multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, Intrinsic IntOp> {
- def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4,
+ def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
!strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
- def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4,
+ def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi16D,
!strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
}
+multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
+ string OpcodeStr, Intrinsic IntOp> {
+ def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D,
+ !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
+ def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D,
+ !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
+}
+
// ....then also with element size of 8 bits:
multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, Intrinsic IntOp>
: N3VLInt3_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp> {
- def v8i16 : N3VLInt3<op24, op23, 0b01, op11_8, op4,
+ def v8i16 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
!strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
}
@@ -640,23 +1389,24 @@ multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// Neon 2-register vector intrinsics,
// element sizes of 8, 16 and 32 bits:
multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
- bits<5> op11_7, bit op4, string OpcodeStr,
- Intrinsic IntOp> {
+ bits<5> op11_7, bit op4,
+ InstrItinClass itinD, InstrItinClass itinQ,
+ string OpcodeStr, Intrinsic IntOp> {
// 64-bit vector types.
def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>;
+ itinD, !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>;
def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>;
+ itinD, !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>;
def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>;
+ itinD, !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>;
// 128-bit vector types.
def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>;
+ itinQ, !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>;
def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>;
+ itinQ, !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>;
def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>;
+ itinQ, !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>;
}
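// A single itinerary per register width (itinD, itinQ) is shared across all
// three element sizes here; the VABS, VQABS, VCLS and VCLZ instantiations
// further down are representative uses.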
@@ -709,25 +1459,25 @@ multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
// Neon 2-register vector shift by immediate,
// element sizes of 8, 16, 32 and 64 bits:
multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
- string OpcodeStr, SDNode OpNode> {
+ InstrItinClass itin, string OpcodeStr, SDNode OpNode> {
// 64-bit vector types.
- def v8i8 : N2VDSh<op24, op23, 0b001000, op11_8, 0, op4,
+ def v8i8 : N2VDSh<op24, op23, 0b001000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "8"), v8i8, OpNode>;
- def v4i16 : N2VDSh<op24, op23, 0b010000, op11_8, 0, op4,
+ def v4i16 : N2VDSh<op24, op23, 0b010000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "16"), v4i16, OpNode>;
- def v2i32 : N2VDSh<op24, op23, 0b100000, op11_8, 0, op4,
+ def v2i32 : N2VDSh<op24, op23, 0b100000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "32"), v2i32, OpNode>;
- def v1i64 : N2VDSh<op24, op23, 0b000000, op11_8, 1, op4,
+ def v1i64 : N2VDSh<op24, op23, 0b000000, op11_8, 1, op4, itin,
!strconcat(OpcodeStr, "64"), v1i64, OpNode>;
// 128-bit vector types.
- def v16i8 : N2VQSh<op24, op23, 0b001000, op11_8, 0, op4,
+ def v16i8 : N2VQSh<op24, op23, 0b001000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "8"), v16i8, OpNode>;
- def v8i16 : N2VQSh<op24, op23, 0b010000, op11_8, 0, op4,
+ def v8i16 : N2VQSh<op24, op23, 0b010000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "16"), v8i16, OpNode>;
- def v4i32 : N2VQSh<op24, op23, 0b100000, op11_8, 0, op4,
+ def v4i32 : N2VQSh<op24, op23, 0b100000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "32"), v4i32, OpNode>;
- def v2i64 : N2VQSh<op24, op23, 0b000000, op11_8, 1, op4,
+ def v2i64 : N2VQSh<op24, op23, 0b000000, op11_8, 1, op4, itin,
!strconcat(OpcodeStr, "64"), v2i64, OpNode>;
}
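// All eight shift-by-immediate forms share one itinerary parameter. As a
// sketch, the v1i64 member of VSHRs below expands to roughly:
//   def VSHRsv1i64 : N2VDSh<0, 1, 0b000000, 0b0000, 1, 1, IIC_VSHLiD,
//                           "vshr.s64", v1i64, NEONvshrs>;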
@@ -790,24 +1540,30 @@ multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
// Vector Add Operations.
// VADD : Vector Add (integer and floating-point)
-defm VADD : N3V_QHSD<0, 0, 0b1000, 0, "vadd.i", add, 1>;
-def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd, 1>;
-def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, "vadd.f32", v4f32, v4f32, fadd, 1>;
+defm VADD : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd.i", add, 1>;
+def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd.f32", v2f32, v2f32, fadd, 1>;
+def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd.f32", v4f32, v4f32, fadd, 1>;
// VADDL : Vector Add Long (Q = D + D)
-defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, "vaddl.s", int_arm_neon_vaddls, 1>;
-defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, "vaddl.u", int_arm_neon_vaddlu, 1>;
+defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, IIC_VSHLiD, "vaddl.s", int_arm_neon_vaddls, 1>;
+defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, "vaddl.u", int_arm_neon_vaddlu, 1>;
// VADDW : Vector Add Wide (Q = Q + D)
defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw.s", int_arm_neon_vaddws, 0>;
defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw.u", int_arm_neon_vaddwu, 0>;
// VHADD : Vector Halving Add
-defm VHADDs : N3VInt_QHS<0,0,0b0000,0, "vhadd.s", int_arm_neon_vhadds, 1>;
-defm VHADDu : N3VInt_QHS<1,0,0b0000,0, "vhadd.u", int_arm_neon_vhaddu, 1>;
+defm VHADDs : N3VInt_QHS<0,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vhadd.s", int_arm_neon_vhadds, 1>;
+defm VHADDu : N3VInt_QHS<1,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vhadd.u", int_arm_neon_vhaddu, 1>;
// VRHADD : Vector Rounding Halving Add
-defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, "vrhadd.s", int_arm_neon_vrhadds, 1>;
-defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, "vrhadd.u", int_arm_neon_vrhaddu, 1>;
+defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vrhadd.s", int_arm_neon_vrhadds, 1>;
+defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vrhadd.u", int_arm_neon_vrhaddu, 1>;
// VQADD : Vector Saturating Add
-defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, "vqadd.s", int_arm_neon_vqadds, 1>;
-defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, "vqadd.u", int_arm_neon_vqaddu, 1>;
+defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vqadd.s", int_arm_neon_vqadds, 1>;
+defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vqadd.u", int_arm_neon_vqaddu, 1>;
// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q)
defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn.i", int_arm_neon_vaddhn, 1>;
// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
@@ -816,64 +1572,208 @@ defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn.i", int_arm_neon_vraddhn, 1>;
// Vector Multiply Operations.
// VMUL : Vector Multiply (integer, polynomial and floating-point)
-defm VMUL : N3V_QHS<0, 0, 0b1001, 1, "vmul.i", mul, 1>;
-def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v8i8, v8i8,
+defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q,
+ IIC_VMULi32Q, "vmul.i", mul, 1>;
+def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16D, "vmul.p8", v8i8, v8i8,
int_arm_neon_vmulp, 1>;
-def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v16i8, v16i8,
+def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16Q, "vmul.p8", v16i8, v16i8,
int_arm_neon_vmulp, 1>;
-def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul, 1>;
-def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, "vmul.f32", v4f32, v4f32, fmul, 1>;
+def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul.f32", v2f32, v2f32, fmul, 1>;
+def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul.f32", v4f32, v4f32, fmul, 1>;
+defm VMULsl : N3VSL_HS<0b1000, "vmul.i", mul>;
+def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul.f32", v2f32, fmul>;
+def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul.f32", v4f32, v2f32, fmul>;
+def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+ (v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
+ (v4i16 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+ (v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
+ (v2i32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
+ (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
+ (v4f32 (VMULslfq (v4f32 QPR:$src1),
+ (v2f32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
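// The patterns above fold a multiply by a duplicated Q-register lane into the
// lane-indexed D-register instruction. Assuming the usual definitions of the
// DSubReg_*/SubReg_* transforms, lane L of a v8i16 Q register becomes lane
// L % 4 of D subregister L / 4 (e.g. lane 5 -> lane 1 of the high D half),
// and likewise with a divisor of 2 for the 32-bit element types.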
+
// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
-defm VQDMULH : N3VInt_HS<0,0,0b1011,0, "vqdmulh.s", int_arm_neon_vqdmulh, 1>;
+defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqdmulh.s", int_arm_neon_vqdmulh, 1>;
+defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqdmulh.s", int_arm_neon_vqdmulh>;
+def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+ (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
+ (v4i16 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+ (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
+ (v2i32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
-defm VQRDMULH : N3VInt_HS<1,0,0b1011,0, "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>;
+defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>;
+defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqrdmulh.s", int_arm_neon_vqrdmulh>;
+def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+ (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
+ (v4i16 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+ (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
+ (v2i32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
-defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, "vmull.s", int_arm_neon_vmulls, 1>;
-defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, "vmull.u", int_arm_neon_vmullu, 1>;
-def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, "vmull.p8", v8i16, v8i8,
+defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, IIC_VMULi16D, "vmull.s", int_arm_neon_vmulls, 1>;
+defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, IIC_VMULi16D, "vmull.u", int_arm_neon_vmullu, 1>;
+def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull.p8", v8i16, v8i8,
int_arm_neon_vmullp, 1>;
+defm VMULLsls : N3VLIntSL_HS<0, 0b1010, IIC_VMULi16D, "vmull.s", int_arm_neon_vmulls>;
+defm VMULLslu : N3VLIntSL_HS<1, 0b1010, IIC_VMULi16D, "vmull.u", int_arm_neon_vmullu>;
+
// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
-defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>;
+defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, "vqdmull.s", int_arm_neon_vqdmull, 1>;
+defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D, "vqdmull.s", int_arm_neon_vqdmull>;
// Vector Multiply-Accumulate and Multiply-Subtract Operations.
// VMLA : Vector Multiply Accumulate (integer and floating-point)
-defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmla.i", add>;
-def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32, fmul, fadd>;
-def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v4f32, fmul, fadd>;
+defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmla.i", add>;
+def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla.f32", v2f32, fmul, fadd>;
+def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla.f32", v4f32, fmul, fadd>;
+defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmla.i", add>;
+def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla.f32", v2f32, fmul, fadd>;
+def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla.f32", v4f32, v2f32, fmul, fadd>;
+
+def : Pat<(v8i16 (add (v8i16 QPR:$src1),
+ (mul (v8i16 QPR:$src2),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+ (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1),
+ (v8i16 QPR:$src2),
+ (v4i16 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (add (v4i32 QPR:$src1),
+ (mul (v4i32 QPR:$src2),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+ (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1),
+ (v4i32 QPR:$src2),
+ (v2i32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fadd (v4f32 QPR:$src1),
+ (fmul (v4f32 QPR:$src2),
+ (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+ (v4f32 (VMLAslfq (v4f32 QPR:$src1),
+ (v4f32 QPR:$src2),
+ (v2f32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, "vmlal.s", int_arm_neon_vmlals>;
defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, "vmlal.u", int_arm_neon_vmlalu>;
+
+defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal.s", int_arm_neon_vmlals>;
+defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal.u", int_arm_neon_vmlalu>;
+
// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal.s", int_arm_neon_vqdmlal>;
+defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal.s", int_arm_neon_vqdmlal>;
+
// VMLS : Vector Multiply Subtract (integer and floating-point)
-defm VMLS : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmls.i", sub>;
-def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32, fmul, fsub>;
-def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v4f32, fmul, fsub>;
+defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmls.i", sub>;
+def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls.f32", v2f32, fmul, fsub>;
+def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls.f32", v4f32, fmul, fsub>;
+defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmls.i", sub>;
+def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls.f32", v2f32, fmul, fsub>;
+def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls.f32", v4f32, v2f32, fmul, fsub>;
+
+def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
+ (mul (v8i16 QPR:$src2),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+ (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1),
+ (v8i16 QPR:$src2),
+ (v4i16 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
+ (mul (v4i32 QPR:$src2),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+ (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1),
+ (v4i32 QPR:$src2),
+ (v2i32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fsub (v4f32 QPR:$src1),
+ (fmul (v4f32 QPR:$src2),
+ (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+ (v4f32 (VMLSslfq (v4f32 QPR:$src1),
+ (v4f32 QPR:$src2),
+ (v2f32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl.s", int_arm_neon_vmlsls>;
defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl.u", int_arm_neon_vmlslu>;
+
+defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl.s", int_arm_neon_vmlsls>;
+defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl.u", int_arm_neon_vmlslu>;
+
// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl.s", int_arm_neon_vqdmlsl>;
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl.s", int_arm_neon_vqdmlsl>;
// Vector Subtract Operations.
// VSUB : Vector Subtract (integer and floating-point)
-defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, "vsub.i", sub, 0>;
-def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub, 0>;
-def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, "vsub.f32", v4f32, v4f32, fsub, 0>;
+defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ, "vsub.i", sub, 0>;
+def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub.f32", v2f32, v2f32, fsub, 0>;
+def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub.f32", v4f32, v4f32, fsub, 0>;
// VSUBL : Vector Subtract Long (Q = D - D)
-defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, "vsubl.s", int_arm_neon_vsubls, 1>;
-defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, "vsubl.u", int_arm_neon_vsublu, 1>;
+defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, IIC_VSHLiD, "vsubl.s", int_arm_neon_vsubls, 1>;
+defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, "vsubl.u", int_arm_neon_vsublu, 1>;
// VSUBW : Vector Subtract Wide (Q = Q - D)
defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw.s", int_arm_neon_vsubws, 0>;
defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw.u", int_arm_neon_vsubwu, 0>;
// VHSUB : Vector Halving Subtract
-defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, "vhsub.s", int_arm_neon_vhsubs, 0>;
-defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, "vhsub.u", int_arm_neon_vhsubu, 0>;
+defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vhsub.s", int_arm_neon_vhsubs, 0>;
+defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vhsub.u", int_arm_neon_vhsubu, 0>;
// VQSUB : Vector Saturating Subtract
-defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, "vqsub.s", int_arm_neon_vqsubs, 0>;
-defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, "vqsub.u", int_arm_neon_vqsubu, 0>;
+defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vqsub.s", int_arm_neon_vqsubs, 0>;
+defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vqsub.u", int_arm_neon_vqsubu, 0>;
// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn.i", int_arm_neon_vsubhn, 0>;
// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
@@ -882,85 +1782,101 @@ defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn.i", int_arm_neon_vrsubhn, 0>;
// Vector Comparisons.
// VCEQ : Vector Compare Equal
-defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, "vceq.i", NEONvceq, 1>;
-def VCEQfd : N3VD<0,0,0b00,0b1110,0, "vceq.f32", v2i32, v2f32, NEONvceq, 1>;
-def VCEQfq : N3VQ<0,0,0b00,0b1110,0, "vceq.f32", v4i32, v4f32, NEONvceq, 1>;
+defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vceq.i", NEONvceq, 1>;
+def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq.f32", v2i32, v2f32, NEONvceq, 1>;
+def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq.f32", v4i32, v4f32, NEONvceq, 1>;
// VCGE : Vector Compare Greater Than or Equal
-defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, "vcge.s", NEONvcge, 0>;
-defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, "vcge.u", NEONvcgeu, 0>;
-def VCGEfd : N3VD<1,0,0b00,0b1110,0, "vcge.f32", v2i32, v2f32, NEONvcge, 0>;
-def VCGEfq : N3VQ<1,0,0b00,0b1110,0, "vcge.f32", v4i32, v4f32, NEONvcge, 0>;
+defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vcge.s", NEONvcge, 0>;
+defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vcge.u", NEONvcgeu, 0>;
+def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge.f32", v2i32, v2f32, NEONvcge, 0>;
+def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge.f32", v4i32, v4f32, NEONvcge, 0>;
// VCGT : Vector Compare Greater Than
-defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, "vcgt.s", NEONvcgt, 0>;
-defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, "vcgt.u", NEONvcgtu, 0>;
-def VCGTfd : N3VD<1,0,0b10,0b1110,0, "vcgt.f32", v2i32, v2f32, NEONvcgt, 0>;
-def VCGTfq : N3VQ<1,0,0b10,0b1110,0, "vcgt.f32", v4i32, v4f32, NEONvcgt, 0>;
+defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vcgt.s", NEONvcgt, 0>;
+defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vcgt.u", NEONvcgtu, 0>;
+def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt.f32", v2i32, v2f32, NEONvcgt, 0>;
+def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt.f32", v4i32, v4f32, NEONvcgt, 0>;
// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
-def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v2i32, v2f32,
+def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, IIC_VBIND, "vacge.f32", v2i32, v2f32,
int_arm_neon_vacged, 0>;
-def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v4i32, v4f32,
+def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, IIC_VBINQ, "vacge.f32", v4i32, v4f32,
int_arm_neon_vacgeq, 0>;
// VACGT : Vector Absolute Compare Greater Than (aka VCAGT)
-def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v2i32, v2f32,
+def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, IIC_VBIND, "vacgt.f32", v2i32, v2f32,
int_arm_neon_vacgtd, 0>;
-def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v4i32, v4f32,
+def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, IIC_VBINQ, "vacgt.f32", v4i32, v4f32,
int_arm_neon_vacgtq, 0>;
// VTST : Vector Test Bits
-defm VTST : N3V_QHS<0, 0, 0b1000, 1, "vtst.i", NEONvtst, 1>;
+defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vtst.i", NEONvtst, 1>;
// Vector Bitwise Operations.
// VAND : Vector Bitwise AND
-def VANDd : N3VD<0, 0, 0b00, 0b0001, 1, "vand", v2i32, v2i32, and, 1>;
-def VANDq : N3VQ<0, 0, 0b00, 0b0001, 1, "vand", v4i32, v4i32, and, 1>;
+def VANDd : N3VD<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand", v2i32, v2i32, and, 1>;
+def VANDq : N3VQ<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand", v4i32, v4i32, and, 1>;
// VEOR : Vector Bitwise Exclusive OR
-def VEORd : N3VD<1, 0, 0b00, 0b0001, 1, "veor", v2i32, v2i32, xor, 1>;
-def VEORq : N3VQ<1, 0, 0b00, 0b0001, 1, "veor", v4i32, v4i32, xor, 1>;
+def VEORd : N3VD<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor", v2i32, v2i32, xor, 1>;
+def VEORq : N3VQ<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor", v4i32, v4i32, xor, 1>;
// VORR : Vector Bitwise OR
-def VORRd : N3VD<0, 0, 0b10, 0b0001, 1, "vorr", v2i32, v2i32, or, 1>;
-def VORRq : N3VQ<0, 0, 0b10, 0b0001, 1, "vorr", v4i32, v4i32, or, 1>;
+def VORRd : N3VD<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr", v2i32, v2i32, or, 1>;
+def VORRq : N3VQ<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", v4i32, v4i32, or, 1>;
// VBIC : Vector Bitwise Bit Clear (AND NOT)
def VBICd : N3V<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
- (ins DPR:$src1, DPR:$src2), "vbic\t$dst, $src1, $src2", "",
- [(set DPR:$dst, (v2i32 (and DPR:$src1,(vnot DPR:$src2))))]>;
+ (ins DPR:$src1, DPR:$src2), IIC_VBINiD,
+ "vbic\t$dst, $src1, $src2", "",
+ [(set DPR:$dst, (v2i32 (and DPR:$src1,
+ (vnot_conv DPR:$src2))))]>;
def VBICq : N3V<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
- (ins QPR:$src1, QPR:$src2), "vbic\t$dst, $src1, $src2", "",
- [(set QPR:$dst, (v4i32 (and QPR:$src1,(vnot QPR:$src2))))]>;
+ (ins QPR:$src1, QPR:$src2), IIC_VBINiQ,
+ "vbic\t$dst, $src1, $src2", "",
+ [(set QPR:$dst, (v4i32 (and QPR:$src1,
+ (vnot_conv QPR:$src2))))]>;
// VORN : Vector Bitwise OR NOT
def VORNd : N3V<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst),
- (ins DPR:$src1, DPR:$src2), "vorn\t$dst, $src1, $src2", "",
- [(set DPR:$dst, (v2i32 (or DPR:$src1, (vnot DPR:$src2))))]>;
+ (ins DPR:$src1, DPR:$src2), IIC_VBINiD,
+ "vorn\t$dst, $src1, $src2", "",
+ [(set DPR:$dst, (v2i32 (or DPR:$src1,
+ (vnot_conv DPR:$src2))))]>;
def VORNq : N3V<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst),
- (ins QPR:$src1, QPR:$src2), "vorn\t$dst, $src1, $src2", "",
- [(set QPR:$dst, (v4i32 (or QPR:$src1, (vnot QPR:$src2))))]>;
+ (ins QPR:$src1, QPR:$src2), IIC_VBINiQ,
+ "vorn\t$dst, $src1, $src2", "",
+ [(set QPR:$dst, (v4i32 (or QPR:$src1,
+ (vnot_conv QPR:$src2))))]>;
// VMVN : Vector Bitwise NOT
def VMVNd : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
- (outs DPR:$dst), (ins DPR:$src), "vmvn\t$dst, $src", "",
+ (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD,
+ "vmvn\t$dst, $src", "",
[(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>;
def VMVNq : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
- (outs QPR:$dst), (ins QPR:$src), "vmvn\t$dst, $src", "",
+ (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD,
+ "vmvn\t$dst, $src", "",
[(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>;
def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>;
def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>;
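// The switch from vnot to vnot_conv in the VBIC/VORN patterns above is
// deliberate: assuming its usual definition in this file, vnot_conv also
// matches an xor with an all-ones vector that arrives through a bitconvert,
// which is how a vector "not" commonly appears after legalization.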
// VBSL : Vector Bitwise Select
def VBSLd : N3V<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
- (ins DPR:$src1, DPR:$src2, DPR:$src3),
+ (ins DPR:$src1, DPR:$src2, DPR:$src3), IIC_VCNTiD,
"vbsl\t$dst, $src2, $src3", "$src1 = $dst",
[(set DPR:$dst,
(v2i32 (or (and DPR:$src2, DPR:$src1),
- (and DPR:$src3, (vnot DPR:$src1)))))]>;
+ (and DPR:$src3, (vnot_conv DPR:$src1)))))]>;
def VBSLq : N3V<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
- (ins QPR:$src1, QPR:$src2, QPR:$src3),
+ (ins QPR:$src1, QPR:$src2, QPR:$src3), IIC_VCNTiQ,
"vbsl\t$dst, $src2, $src3", "$src1 = $dst",
[(set QPR:$dst,
(v4i32 (or (and QPR:$src2, QPR:$src1),
- (and QPR:$src3, (vnot QPR:$src1)))))]>;
+ (and QPR:$src3, (vnot_conv QPR:$src1)))))]>;
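// In select terms: each result bit of VBSL comes from $src2 where the
// corresponding bit of the mask $src1 is 1, and from $src3 where it is 0,
// i.e. dst = (src2 & mask) | (src3 & ~mask).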
// VBIF : Vector Bitwise Insert if False
// like VBSL but with: "vbif\t$dst, $src3, $src1", "$src2 = $dst",
@@ -973,16 +1889,18 @@ def VBSLq : N3V<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
// Vector Absolute Differences.
// VABD : Vector Absolute Difference
-defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, "vabd.s", int_arm_neon_vabds, 0>;
-defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, "vabd.u", int_arm_neon_vabdu, 0>;
-def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v2f32, v2f32,
- int_arm_neon_vabdf, 0>;
-def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v4f32, v4f32,
- int_arm_neon_vabdf, 0>;
+defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vabd.s", int_arm_neon_vabds, 0>;
+defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vabd.u", int_arm_neon_vabdu, 0>;
+def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, IIC_VBIND, "vabd.f32", v2f32, v2f32,
+ int_arm_neon_vabds, 0>;
+def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vabd.f32", v4f32, v4f32,
+ int_arm_neon_vabds, 0>;
// VABDL : Vector Absolute Difference Long (Q = | D - D |)
-defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, "vabdl.s", int_arm_neon_vabdls, 0>;
-defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, "vabdl.u", int_arm_neon_vabdlu, 0>;
+defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, IIC_VBINi4Q, "vabdl.s", int_arm_neon_vabdls, 0>;
+defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VBINi4Q, "vabdl.u", int_arm_neon_vabdlu, 0>;
// VABA : Vector Absolute Difference and Accumulate
defm VABAs : N3VInt3_QHS<0,1,0b0101,0, "vaba.s", int_arm_neon_vabas>;
@@ -995,32 +1913,36 @@ defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal.u", int_arm_neon_vabalu>;
// Vector Maximum and Minimum.
// VMAX : Vector Maximum
-defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, "vmax.s", int_arm_neon_vmaxs, 1>;
-defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, "vmax.u", int_arm_neon_vmaxu, 1>;
-def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v2f32, v2f32,
- int_arm_neon_vmaxf, 1>;
-def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v4f32, v4f32,
- int_arm_neon_vmaxf, 1>;
+defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vmax.s", int_arm_neon_vmaxs, 1>;
+defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vmax.u", int_arm_neon_vmaxu, 1>;
+def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, IIC_VBIND, "vmax.f32", v2f32, v2f32,
+ int_arm_neon_vmaxs, 1>;
+def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, IIC_VBINQ, "vmax.f32", v4f32, v4f32,
+ int_arm_neon_vmaxs, 1>;
// VMIN : Vector Minimum
-defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, "vmin.s", int_arm_neon_vmins, 1>;
-defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, "vmin.u", int_arm_neon_vminu, 1>;
-def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v2f32, v2f32,
- int_arm_neon_vminf, 1>;
-def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v4f32, v4f32,
- int_arm_neon_vminf, 1>;
+defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vmin.s", int_arm_neon_vmins, 1>;
+defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vmin.u", int_arm_neon_vminu, 1>;
+def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, IIC_VBIND, "vmin.f32", v2f32, v2f32,
+ int_arm_neon_vmins, 1>;
+def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, IIC_VBINQ, "vmin.f32", v4f32, v4f32,
+ int_arm_neon_vmins, 1>;
// Vector Pairwise Operations.
// VPADD : Vector Pairwise Add
-def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, "vpadd.i8", v8i8, v8i8,
- int_arm_neon_vpaddi, 0>;
-def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, "vpadd.i16", v4i16, v4i16,
- int_arm_neon_vpaddi, 0>;
-def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, "vpadd.i32", v2i32, v2i32,
- int_arm_neon_vpaddi, 0>;
-def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, "vpadd.f32", v2f32, v2f32,
- int_arm_neon_vpaddf, 0>;
+def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, IIC_VBINiD, "vpadd.i8", v8i8, v8i8,
+ int_arm_neon_vpadd, 0>;
+def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, IIC_VBINiD, "vpadd.i16", v4i16, v4i16,
+ int_arm_neon_vpadd, 0>;
+def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, IIC_VBINiD, "vpadd.i32", v2i32, v2i32,
+ int_arm_neon_vpadd, 0>;
+def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, IIC_VBIND, "vpadd.f32", v2f32, v2f32,
+ int_arm_neon_vpadd, 0>;
// VPADDL : Vector Pairwise Add Long
defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl.s",
@@ -1035,81 +1957,91 @@ defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpadal.u",
int_arm_neon_vpadalu>;
// VPMAX : Vector Pairwise Maximum
-def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, "vpmax.s8", v8i8, v8i8,
+def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax.s8", v8i8, v8i8,
int_arm_neon_vpmaxs, 0>;
-def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, "vpmax.s16", v4i16, v4i16,
+def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax.s16", v4i16, v4i16,
int_arm_neon_vpmaxs, 0>;
-def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, "vpmax.s32", v2i32, v2i32,
+def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax.s32", v2i32, v2i32,
int_arm_neon_vpmaxs, 0>;
-def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, "vpmax.u8", v8i8, v8i8,
+def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax.u8", v8i8, v8i8,
int_arm_neon_vpmaxu, 0>;
-def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, "vpmax.u16", v4i16, v4i16,
+def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax.u16", v4i16, v4i16,
int_arm_neon_vpmaxu, 0>;
-def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, "vpmax.u32", v2i32, v2i32,
+def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax.u32", v2i32, v2i32,
int_arm_neon_vpmaxu, 0>;
-def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, "vpmax.f32", v2f32, v2f32,
- int_arm_neon_vpmaxf, 0>;
+def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, IIC_VBINi4D, "vpmax.f32", v2f32, v2f32,
+ int_arm_neon_vpmaxs, 0>;
// VPMIN : Vector Pairwise Minimum
-def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, "vpmin.s8", v8i8, v8i8,
+def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin.s8", v8i8, v8i8,
int_arm_neon_vpmins, 0>;
-def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, "vpmin.s16", v4i16, v4i16,
+def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin.s16", v4i16, v4i16,
int_arm_neon_vpmins, 0>;
-def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, "vpmin.s32", v2i32, v2i32,
+def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin.s32", v2i32, v2i32,
int_arm_neon_vpmins, 0>;
-def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, "vpmin.u8", v8i8, v8i8,
+def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin.u8", v8i8, v8i8,
int_arm_neon_vpminu, 0>;
-def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, "vpmin.u16", v4i16, v4i16,
+def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin.u16", v4i16, v4i16,
int_arm_neon_vpminu, 0>;
-def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, "vpmin.u32", v2i32, v2i32,
+def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin.u32", v2i32, v2i32,
int_arm_neon_vpminu, 0>;
-def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, "vpmin.f32", v2f32, v2f32,
- int_arm_neon_vpminf, 0>;
+def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, IIC_VBINi4D, "vpmin.f32", v2f32, v2f32,
+ int_arm_neon_vpmins, 0>;
// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
// VRECPE : Vector Reciprocal Estimate
-def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
+def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+ IIC_VUNAD, "vrecpe.u32",
v2i32, v2i32, int_arm_neon_vrecpe>;
-def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
+def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+ IIC_VUNAQ, "vrecpe.u32",
v4i32, v4i32, int_arm_neon_vrecpe>;
-def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
- v2f32, v2f32, int_arm_neon_vrecpef>;
-def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
- v4f32, v4f32, int_arm_neon_vrecpef>;
+def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+ IIC_VUNAD, "vrecpe.f32",
+ v2f32, v2f32, int_arm_neon_vrecpe>;
+def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+ IIC_VUNAQ, "vrecpe.f32",
+ v4f32, v4f32, int_arm_neon_vrecpe>;
// VRECPS : Vector Reciprocal Step
-def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v2f32, v2f32,
+def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, IIC_VRECSD, "vrecps.f32", v2f32, v2f32,
int_arm_neon_vrecps, 1>;
-def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v4f32, v4f32,
+def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, IIC_VRECSQ, "vrecps.f32", v4f32, v4f32,
int_arm_neon_vrecps, 1>;
// VRSQRTE : Vector Reciprocal Square Root Estimate
-def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
- v2i32, v2i32, int_arm_neon_vrsqrte>;
-def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
- v4i32, v4i32, int_arm_neon_vrsqrte>;
-def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
- v2f32, v2f32, int_arm_neon_vrsqrtef>;
-def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
- v4f32, v4f32, int_arm_neon_vrsqrtef>;
+def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+ IIC_VUNAD, "vrsqrte.u32",
+ v2i32, v2i32, int_arm_neon_vrsqrte>;
+def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+ IIC_VUNAQ, "vrsqrte.u32",
+ v4i32, v4i32, int_arm_neon_vrsqrte>;
+def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+ IIC_VUNAD, "vrsqrte.f32",
+ v2f32, v2f32, int_arm_neon_vrsqrte>;
+def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+ IIC_VUNAQ, "vrsqrte.f32",
+ v4f32, v4f32, int_arm_neon_vrsqrte>;
// VRSQRTS : Vector Reciprocal Square Root Step
-def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v2f32, v2f32,
+def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, IIC_VRECSD, "vrsqrts.f32", v2f32, v2f32,
int_arm_neon_vrsqrts, 1>;
-def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v4f32, v4f32,
+def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, IIC_VRECSQ, "vrsqrts.f32", v4f32, v4f32,
int_arm_neon_vrsqrts, 1>;
// Vector Shifts.
// VSHL : Vector Shift
-defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, "vshl.s", int_arm_neon_vshifts, 0>;
-defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, "vshl.u", int_arm_neon_vshiftu, 0>;
+defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ,
+ IIC_VSHLiQ, "vshl.s", int_arm_neon_vshifts, 0>;
+defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ,
+ IIC_VSHLiQ, "vshl.u", int_arm_neon_vshiftu, 0>;
// VSHL : Vector Shift Left (Immediate)
-defm VSHLi : N2VSh_QHSD<0, 1, 0b0111, 1, "vshl.i", NEONvshl>;
+defm VSHLi : N2VSh_QHSD<0, 1, 0b0111, 1, IIC_VSHLiD, "vshl.i", NEONvshl>;
// VSHR : Vector Shift Right (Immediate)
-defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, "vshr.s", NEONvshrs>;
-defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, "vshr.u", NEONvshru>;
+defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr.s", NEONvshrs>;
+defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr.u", NEONvshru>;
// VSHLL : Vector Shift Left Long
def VSHLLs8 : N2VLSh<0, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.s8",
@@ -1134,86 +2066,90 @@ def VSHLLi32 : N2VLSh<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll.i32",
v2i64, v2i32, NEONvshlli>;
// VSHRN : Vector Shift Right and Narrow
-def VSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1, "vshrn.i16",
- v8i8, v8i16, NEONvshrn>;
-def VSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1, "vshrn.i32",
- v4i16, v4i32, NEONvshrn>;
-def VSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1, "vshrn.i64",
- v2i32, v2i64, NEONvshrn>;
+def VSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1,
+ IIC_VSHLiD, "vshrn.i16", v8i8, v8i16, NEONvshrn>;
+def VSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1,
+ IIC_VSHLiD, "vshrn.i32", v4i16, v4i32, NEONvshrn>;
+def VSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1,
+ IIC_VSHLiD, "vshrn.i64", v2i32, v2i64, NEONvshrn>;
// VRSHL : Vector Rounding Shift
-defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, "vrshl.s", int_arm_neon_vrshifts, 0>;
-defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, "vrshl.u", int_arm_neon_vrshiftu, 0>;
+defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+ IIC_VSHLi4Q, "vrshl.s", int_arm_neon_vrshifts, 0>;
+defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+ IIC_VSHLi4Q, "vrshl.u", int_arm_neon_vrshiftu, 0>;
// VRSHR : Vector Rounding Shift Right
-defm VRSHRs : N2VSh_QHSD<0, 1, 0b0010, 1, "vrshr.s", NEONvrshrs>;
-defm VRSHRu : N2VSh_QHSD<1, 1, 0b0010, 1, "vrshr.u", NEONvrshru>;
+defm VRSHRs : N2VSh_QHSD<0, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr.s", NEONvrshrs>;
+defm VRSHRu : N2VSh_QHSD<1, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr.u", NEONvrshru>;
// VRSHRN : Vector Rounding Shift Right and Narrow
-def VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1, "vrshrn.i16",
- v8i8, v8i16, NEONvrshrn>;
-def VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1, "vrshrn.i32",
- v4i16, v4i32, NEONvrshrn>;
-def VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1, "vrshrn.i64",
- v2i32, v2i64, NEONvrshrn>;
+def VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vrshrn.i16", v8i8, v8i16, NEONvrshrn>;
+def VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vrshrn.i32", v4i16, v4i32, NEONvrshrn>;
+def VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vrshrn.i64", v2i32, v2i64, NEONvrshrn>;
// VQSHL : Vector Saturating Shift
-defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, "vqshl.s", int_arm_neon_vqshifts, 0>;
-defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, "vqshl.u", int_arm_neon_vqshiftu, 0>;
+defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+ IIC_VSHLi4Q, "vqshl.s", int_arm_neon_vqshifts, 0>;
+defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+ IIC_VSHLi4Q, "vqshl.u", int_arm_neon_vqshiftu, 0>;
// VQSHL : Vector Saturating Shift Left (Immediate)
-defm VQSHLsi : N2VSh_QHSD<0, 1, 0b0111, 1, "vqshl.s", NEONvqshls>;
-defm VQSHLui : N2VSh_QHSD<1, 1, 0b0111, 1, "vqshl.u", NEONvqshlu>;
+defm VQSHLsi : N2VSh_QHSD<0, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl.s", NEONvqshls>;
+defm VQSHLui : N2VSh_QHSD<1, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl.u", NEONvqshlu>;
// VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned)
-defm VQSHLsu : N2VSh_QHSD<1, 1, 0b0110, 1, "vqshlu.s", NEONvqshlsu>;
+defm VQSHLsu : N2VSh_QHSD<1, 1, 0b0110, 1, IIC_VSHLi4D, "vqshlu.s", NEONvqshlsu>;
// VQSHRN : Vector Saturating Shift Right and Narrow
-def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.s16",
- v8i8, v8i16, NEONvqshrns>;
-def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.s32",
- v4i16, v4i32, NEONvqshrns>;
-def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.s64",
- v2i32, v2i64, NEONvqshrns>;
-def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.u16",
- v8i8, v8i16, NEONvqshrnu>;
-def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.u32",
- v4i16, v4i32, NEONvqshrnu>;
-def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.u64",
- v2i32, v2i64, NEONvqshrnu>;
+def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.s16", v8i8, v8i16, NEONvqshrns>;
+def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.s32", v4i16, v4i32, NEONvqshrns>;
+def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.s64", v2i32, v2i64, NEONvqshrns>;
+def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.u16", v8i8, v8i16, NEONvqshrnu>;
+def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.u32", v4i16, v4i32, NEONvqshrnu>;
+def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.u64", v2i32, v2i64, NEONvqshrnu>;
// VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned)
-def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1, "vqshrun.s16",
- v8i8, v8i16, NEONvqshrnsu>;
-def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1, "vqshrun.s32",
- v4i16, v4i32, NEONvqshrnsu>;
-def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1, "vqshrun.s64",
- v2i32, v2i64, NEONvqshrnsu>;
+def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrun.s16", v8i8, v8i16, NEONvqshrnsu>;
+def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrun.s32", v4i16, v4i32, NEONvqshrnsu>;
+def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrun.s64", v2i32, v2i64, NEONvqshrnsu>;
// VQRSHL : Vector Saturating Rounding Shift
-defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, "vqrshl.s",
- int_arm_neon_vqrshifts, 0>;
-defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, "vqrshl.u",
- int_arm_neon_vqrshiftu, 0>;
+defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+ IIC_VSHLi4Q, "vqrshl.s", int_arm_neon_vqrshifts, 0>;
+defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+ IIC_VSHLi4Q, "vqrshl.u", int_arm_neon_vqrshiftu, 0>;
// VQRSHRN : Vector Saturating Rounding Shift Right and Narrow
-def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.s16",
- v8i8, v8i16, NEONvqrshrns>;
-def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.s32",
- v4i16, v4i32, NEONvqrshrns>;
-def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.s64",
- v2i32, v2i64, NEONvqrshrns>;
-def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.u16",
- v8i8, v8i16, NEONvqrshrnu>;
-def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.u32",
- v4i16, v4i32, NEONvqrshrnu>;
-def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.u64",
- v2i32, v2i64, NEONvqrshrnu>;
+def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.s16", v8i8, v8i16, NEONvqrshrns>;
+def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.s32", v4i16, v4i32, NEONvqrshrns>;
+def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.s64", v2i32, v2i64, NEONvqrshrns>;
+def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.u16", v8i8, v8i16, NEONvqrshrnu>;
+def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.u32", v4i16, v4i32, NEONvqrshrnu>;
+def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.u64", v2i32, v2i64, NEONvqrshrnu>;
// VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
-def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1, "vqrshrun.s16",
- v8i8, v8i16, NEONvqrshrnsu>;
-def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1, "vqrshrun.s32",
- v4i16, v4i32, NEONvqrshrnsu>;
-def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1, "vqrshrun.s64",
- v2i32, v2i64, NEONvqrshrnsu>;
+def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrun.s16", v8i8, v8i16, NEONvqrshrnsu>;
+def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrun.s32", v4i16, v4i32, NEONvqrshrnsu>;
+def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrun.s64", v2i32, v2i64, NEONvqrshrnsu>;
// VSRA : Vector Shift Right and Accumulate
defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra.s", NEONvshrs>;
@@ -1230,15 +2166,19 @@ defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri.", NEONvsri>;
// Vector Absolute and Saturating Absolute.
// VABS : Vector Absolute Value
-defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, "vabs.s",
+defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0,
+ IIC_VUNAiD, IIC_VUNAiQ, "vabs.s",
int_arm_neon_vabs>;
-def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
- v2f32, v2f32, int_arm_neon_vabsf>;
-def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
- v4f32, v4f32, int_arm_neon_vabsf>;
+def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+ IIC_VUNAD, "vabs.f32",
+ v2f32, v2f32, int_arm_neon_vabs>;
+def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+ IIC_VUNAQ, "vabs.f32",
+ v4f32, v4f32, int_arm_neon_vabs>;
// VQABS : Vector Saturating Absolute Value
-defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, "vqabs.s",
+defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
+ IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs.s",
int_arm_neon_vqabs>;
// Vector Negate.
@@ -1248,11 +2188,11 @@ def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>;
class VNEGD<bits<2> size, string OpcodeStr, ValueType Ty>
: N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src),
- !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set DPR:$dst, (Ty (vneg DPR:$src)))]>;
class VNEGQ<bits<2> size, string OpcodeStr, ValueType Ty>
: N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src),
- !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set QPR:$dst, (Ty (vneg QPR:$src)))]>;
// VNEG : Vector Negate
@@ -1265,10 +2205,12 @@ def VNEGs32q : VNEGQ<0b10, "vneg.s32", v4i32>;
// VNEG : Vector Negate (floating-point)
def VNEGf32d : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
- (outs DPR:$dst), (ins DPR:$src), "vneg.f32\t$dst, $src", "",
+ (outs DPR:$dst), (ins DPR:$src), IIC_VUNAD,
+ "vneg.f32\t$dst, $src", "",
[(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>;
def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
- (outs QPR:$dst), (ins QPR:$src), "vneg.f32\t$dst, $src", "",
+ (outs QPR:$dst), (ins QPR:$src), IIC_VUNAQ,
+ "vneg.f32\t$dst, $src", "",
[(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>;
def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>;
@@ -1279,21 +2221,26 @@ def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>;
def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>;
// VQNEG : Vector Saturating Negate
-defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, "vqneg.s",
+defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
+ IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg.s",
int_arm_neon_vqneg>;
// Vector Bit Counting Operations.
// VCLS : Vector Count Leading Sign Bits
-defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, "vcls.s",
+defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
+ IIC_VCNTiD, IIC_VCNTiQ, "vcls.s",
int_arm_neon_vcls>;
// VCLZ : Vector Count Leading Zeros
-defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, "vclz.i",
+defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
+ IIC_VCNTiD, IIC_VCNTiQ, "vclz.i",
int_arm_neon_vclz>;
// VCNT : Vector Count One Bits
-def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
+def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+ IIC_VCNTiD, "vcnt.8",
v8i8, v8i8, int_arm_neon_vcnt>;
-def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
+def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+ IIC_VCNTiQ, "vcnt.8",
v16i8, v16i8, int_arm_neon_vcnt>;
// Vector Move Operations.
@@ -1301,9 +2248,9 @@ def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
// VMOV : Vector Move (Register)
def VMOVD : N3V<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src),
- "vmov\t$dst, $src", "", []>;
+ IIC_VMOVD, "vmov\t$dst, $src", "", []>;
def VMOVQ : N3V<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src),
- "vmov\t$dst, $src", "", []>;
+ IIC_VMOVD, "vmov\t$dst, $src", "", []>;
// VMOV : Vector Move (Immediate)
@@ -1343,146 +2290,188 @@ def vmovImm64 : PatLeaf<(build_vector), [{
// be encoded based on the immed values.
def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst),
- (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "",
+ (ins i8imm:$SIMM), IIC_VMOVImm,
+ "vmov.i8\t$dst, $SIMM", "",
[(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>;
def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst),
- (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "",
+ (ins i8imm:$SIMM), IIC_VMOVImm,
+ "vmov.i8\t$dst, $SIMM", "",
[(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>;
def VMOVv4i16 : N1ModImm<1, 0b000, 0b1000, 0, 0, 0, 1, (outs DPR:$dst),
- (ins i16imm:$SIMM), "vmov.i16\t$dst, $SIMM", "",
+ (ins i16imm:$SIMM), IIC_VMOVImm,
+ "vmov.i16\t$dst, $SIMM", "",
[(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>;
def VMOVv8i16 : N1ModImm<1, 0b000, 0b1000, 0, 1, 0, 1, (outs QPR:$dst),
- (ins i16imm:$SIMM), "vmov.i16\t$dst, $SIMM", "",
+ (ins i16imm:$SIMM), IIC_VMOVImm,
+ "vmov.i16\t$dst, $SIMM", "",
[(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>;
def VMOVv2i32 : N1ModImm<1, 0b000, 0b0000, 0, 0, 0, 1, (outs DPR:$dst),
- (ins i32imm:$SIMM), "vmov.i32\t$dst, $SIMM", "",
+ (ins i32imm:$SIMM), IIC_VMOVImm,
+ "vmov.i32\t$dst, $SIMM", "",
[(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>;
def VMOVv4i32 : N1ModImm<1, 0b000, 0b0000, 0, 1, 0, 1, (outs QPR:$dst),
- (ins i32imm:$SIMM), "vmov.i32\t$dst, $SIMM", "",
+ (ins i32imm:$SIMM), IIC_VMOVImm,
+ "vmov.i32\t$dst, $SIMM", "",
[(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>;
def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst),
- (ins i64imm:$SIMM), "vmov.i64\t$dst, $SIMM", "",
+ (ins i64imm:$SIMM), IIC_VMOVImm,
+ "vmov.i64\t$dst, $SIMM", "",
[(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>;
def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst),
- (ins i64imm:$SIMM), "vmov.i64\t$dst, $SIMM", "",
+ (ins i64imm:$SIMM), IIC_VMOVImm,
+ "vmov.i64\t$dst, $SIMM", "",
[(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>;
// VMOV : Vector Get Lane (move scalar to ARM core register)
def VGETLNs8 : NVGetLane<0b11100101, 0b1011, 0b00,
- (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
- "vmov", ".s8\t$dst, $src[$lane]",
+ (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+ IIC_VMOVSI, "vmov", ".s8\t$dst, $src[$lane]",
[(set GPR:$dst, (NEONvgetlanes (v8i8 DPR:$src),
imm:$lane))]>;
def VGETLNs16 : NVGetLane<0b11100001, 0b1011, 0b01,
- (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
- "vmov", ".s16\t$dst, $src[$lane]",
+ (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+ IIC_VMOVSI, "vmov", ".s16\t$dst, $src[$lane]",
[(set GPR:$dst, (NEONvgetlanes (v4i16 DPR:$src),
imm:$lane))]>;
def VGETLNu8 : NVGetLane<0b11101101, 0b1011, 0b00,
- (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
- "vmov", ".u8\t$dst, $src[$lane]",
+ (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+ IIC_VMOVSI, "vmov", ".u8\t$dst, $src[$lane]",
[(set GPR:$dst, (NEONvgetlaneu (v8i8 DPR:$src),
imm:$lane))]>;
def VGETLNu16 : NVGetLane<0b11101001, 0b1011, 0b01,
- (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
- "vmov", ".u16\t$dst, $src[$lane]",
+ (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+ IIC_VMOVSI, "vmov", ".u16\t$dst, $src[$lane]",
[(set GPR:$dst, (NEONvgetlaneu (v4i16 DPR:$src),
imm:$lane))]>;
def VGETLNi32 : NVGetLane<0b11100001, 0b1011, 0b00,
- (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
- "vmov", ".32\t$dst, $src[$lane]",
+ (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+ IIC_VMOVSI, "vmov", ".32\t$dst, $src[$lane]",
[(set GPR:$dst, (extractelt (v2i32 DPR:$src),
imm:$lane))]>;
// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
(VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
- (SubReg_i8_reg imm:$lane))),
+ (DSubReg_i8_reg imm:$lane))),
(SubReg_i8_lane imm:$lane))>;
def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
(VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
- (SubReg_i16_reg imm:$lane))),
+ (DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane))>;
def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
(VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
- (SubReg_i8_reg imm:$lane))),
+ (DSubReg_i8_reg imm:$lane))),
(SubReg_i8_lane imm:$lane))>;
def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
(VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
- (SubReg_i16_reg imm:$lane))),
+ (DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane))>;
def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
(VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
- (SubReg_i32_reg imm:$lane))),
+ (DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane))>;
+def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2),
+ (SSubReg_f32_reg imm:$src2))>;
+def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2),
+ (SSubReg_f32_reg imm:$src2))>;
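// Extracting an f32 element needs no NEON instruction at all: the vector is
// first constrained to the VFP2-compatible subclass (DPR_VFP2/QPR_VFP2) so
// that, assuming the usual SSubReg_f32_reg transform, the element can be
// referenced directly as one of the register's S subregisters.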
//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2),
-// (EXTRACT_SUBREG QPR:$src1, (SubReg_f64_reg imm:$src2))>;
+// (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
- (EXTRACT_SUBREG QPR:$src1, (SubReg_f64_reg imm:$src2))>;
+ (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
// VMOV : Vector Set Lane (move ARM core register to scalar)
let Constraints = "$src1 = $dst" in {
def VSETLNi8 : NVSetLane<0b11100100, 0b1011, 0b00, (outs DPR:$dst),
- (ins DPR:$src1, GPR:$src2, i32imm:$lane),
- "vmov", ".8\t$dst[$lane], $src2",
+ (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+ IIC_VMOVISL, "vmov", ".8\t$dst[$lane], $src2",
[(set DPR:$dst, (vector_insert (v8i8 DPR:$src1),
GPR:$src2, imm:$lane))]>;
def VSETLNi16 : NVSetLane<0b11100000, 0b1011, 0b01, (outs DPR:$dst),
- (ins DPR:$src1, GPR:$src2, i32imm:$lane),
- "vmov", ".16\t$dst[$lane], $src2",
+ (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+ IIC_VMOVISL, "vmov", ".16\t$dst[$lane], $src2",
[(set DPR:$dst, (vector_insert (v4i16 DPR:$src1),
GPR:$src2, imm:$lane))]>;
def VSETLNi32 : NVSetLane<0b11100000, 0b1011, 0b00, (outs DPR:$dst),
- (ins DPR:$src1, GPR:$src2, i32imm:$lane),
- "vmov", ".32\t$dst[$lane], $src2",
+ (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+ IIC_VMOVISL, "vmov", ".32\t$dst[$lane], $src2",
[(set DPR:$dst, (insertelt (v2i32 DPR:$src1),
GPR:$src2, imm:$lane))]>;
}
def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
(v16i8 (INSERT_SUBREG QPR:$src1,
(VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
- (SubReg_i8_reg imm:$lane))),
+ (DSubReg_i8_reg imm:$lane))),
GPR:$src2, (SubReg_i8_lane imm:$lane)),
- (SubReg_i8_reg imm:$lane)))>;
+ (DSubReg_i8_reg imm:$lane)))>;
def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
(v8i16 (INSERT_SUBREG QPR:$src1,
(VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
- (SubReg_i16_reg imm:$lane))),
+ (DSubReg_i16_reg imm:$lane))),
GPR:$src2, (SubReg_i16_lane imm:$lane)),
- (SubReg_i16_reg imm:$lane)))>;
+ (DSubReg_i16_reg imm:$lane)))>;
def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
(v4i32 (INSERT_SUBREG QPR:$src1,
(VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
- (SubReg_i32_reg imm:$lane))),
+ (DSubReg_i32_reg imm:$lane))),
GPR:$src2, (SubReg_i32_lane imm:$lane)),
- (SubReg_i32_reg imm:$lane)))>;
+ (DSubReg_i32_reg imm:$lane)))>;
+
+def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
+ (INSERT_SUBREG (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2),
+ SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
+ (INSERT_SUBREG (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2),
+ SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
-// (INSERT_SUBREG QPR:$src1, DPR:$src2, (SubReg_f64_reg imm:$src3))>;
+// (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
- (INSERT_SUBREG QPR:$src1, DPR:$src2, (SubReg_f64_reg imm:$src3))>;
+ (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+
+def : Pat<(v2f32 (scalar_to_vector SPR:$src)),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, arm_ssubreg_0)>;
+def : Pat<(v2f64 (scalar_to_vector DPR:$src)),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, arm_dsubreg_0)>;
+def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, arm_ssubreg_0)>;
+
+def : Pat<(v8i8 (scalar_to_vector GPR:$src)),
+ (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v4i16 (scalar_to_vector GPR:$src)),
+ (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v2i32 (scalar_to_vector GPR:$src)),
+ (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+
+def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+ arm_dsubreg_0)>;
+def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+ arm_dsubreg_0)>;
+def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+ arm_dsubreg_0)>;
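// The 128-bit scalar_to_vector patterns above build the scalar into lane 0
// of an undef D register and then drop that into the low half of an undef
// Q register; the high half legitimately stays undef. E.g. for
// (v4i32 (scalar_to_vector r0)) this is roughly:
//   vmov.32 d0[0], r0    @ q0 = <r0, undef, undef, undef>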
// VDUP : Vector Duplicate (from ARM core register to all elements)
-def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
-}]>;
-
class VDUPD<bits<8> opcod1, bits<2> opcod3, string asmSize, ValueType Ty>
: NVDup<opcod1, 0b1011, opcod3, (outs DPR:$dst), (ins GPR:$src),
- "vdup", !strconcat(asmSize, "\t$dst, $src"),
- [(set DPR:$dst, (Ty (splat_lo (scalar_to_vector GPR:$src), undef)))]>;
+ IIC_VMOVIS, "vdup", !strconcat(asmSize, "\t$dst, $src"),
+ [(set DPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>;
class VDUPQ<bits<8> opcod1, bits<2> opcod3, string asmSize, ValueType Ty>
: NVDup<opcod1, 0b1011, opcod3, (outs QPR:$dst), (ins GPR:$src),
- "vdup", !strconcat(asmSize, "\t$dst, $src"),
- [(set QPR:$dst, (Ty (splat_lo (scalar_to_vector GPR:$src), undef)))]>;
+ IIC_VMOVIS, "vdup", !strconcat(asmSize, "\t$dst, $src"),
+ [(set QPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>;
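// NEONvdup replaces the splat_lo PatFrag removed above: instead of
// recognizing a vector_shuffle that splats lane 0, lowering now emits a
// dedicated target node, presumably declared along these lines (a sketch,
// not the verbatim definition):
//   def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
// This also lets the VDUP patterns take the i32/f32 scalar directly,
// without a scalar_to_vector wrapper.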
def VDUP8d : VDUPD<0b11101100, 0b00, ".8", v8i8>;
def VDUP16d : VDUPD<0b11101000, 0b01, ".16", v4i16>;
@@ -1492,45 +2481,28 @@ def VDUP16q : VDUPQ<0b11101010, 0b01, ".16", v8i16>;
def VDUP32q : VDUPQ<0b11101010, 0b00, ".32", v4i32>;
def VDUPfd : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$dst), (ins GPR:$src),
- "vdup", ".32\t$dst, $src",
- [(set DPR:$dst, (v2f32 (splat_lo
- (scalar_to_vector
- (f32 (bitconvert GPR:$src))),
- undef)))]>;
+ IIC_VMOVIS, "vdup", ".32\t$dst, $src",
+ [(set DPR:$dst, (v2f32 (NEONvdup
+ (f32 (bitconvert GPR:$src)))))]>;
def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src),
- "vdup", ".32\t$dst, $src",
- [(set QPR:$dst, (v4f32 (splat_lo
- (scalar_to_vector
- (f32 (bitconvert GPR:$src))),
- undef)))]>;
+ IIC_VMOVIS, "vdup", ".32\t$dst, $src",
+ [(set QPR:$dst, (v4f32 (NEONvdup
+ (f32 (bitconvert GPR:$src)))))]>;
// VDUP : Vector Duplicate Lane (from scalar to all elements)
-def SHUFFLE_get_splat_lane : SDNodeXForm<vector_shuffle, [{
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- return CurDAG->getTargetConstant(SVOp->getSplatIndex(), MVT::i32);
-}]>;
-
-def splat_lane : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- return SVOp->isSplat();
-}], SHUFFLE_get_splat_lane>;
-
class VDUPLND<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, ValueType Ty>
: N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0,
- (outs DPR:$dst), (ins DPR:$src, i32imm:$lane),
+ (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
!strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
- [(set DPR:$dst, (Ty (splat_lane:$lane DPR:$src, undef)))]>;
+ [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>;
-// vector_shuffle requires that the source and destination types match, so
-// VDUP to a 128-bit result uses a target-specific VDUPLANEQ node.
class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr,
ValueType ResTy, ValueType OpTy>
: N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0,
- (outs QPR:$dst), (ins DPR:$src, i32imm:$lane),
+ (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
!strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
- [(set QPR:$dst, (ResTy (NEONvduplaneq (OpTy DPR:$src), imm:$lane)))]>;
+ [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), imm:$lane)))]>;
def VDUPLN8d : VDUPLND<0b00, 0b01, "vdup.8", v8i8>;
def VDUPLN16d : VDUPLND<0b00, 0b10, "vdup.16", v4i16>;
@@ -1541,15 +2513,51 @@ def VDUPLN16q : VDUPLNQ<0b00, 0b10, "vdup.16", v8i16, v4i16>;
def VDUPLN32q : VDUPLNQ<0b01, 0b00, "vdup.32", v4i32, v2i32>;
def VDUPLNfq : VDUPLNQ<0b01, 0b00, "vdup.32", v4f32, v2f32>;
+def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
+ (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i8_reg imm:$lane))),
+ (SubReg_i8_lane imm:$lane)))>;
+def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
+ (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
+ (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
+ (v4f32 (VDUPLNfq (v2f32 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+def VDUPfdf : N2V<0b11, 0b11, 0b01, 0b00, 0b11000, 0, 0,
+ (outs DPR:$dst), (ins SPR:$src),
+ IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
+ [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>;
+
+def VDUPfqf : N2V<0b11, 0b11, 0b01, 0b00, 0b11000, 1, 0,
+ (outs QPR:$dst), (ins SPR:$src),
+ IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
+ [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>;
+
+def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)),
+ (INSERT_SUBREG QPR:$src,
+ (i64 (EXTRACT_SUBREG QPR:$src, (DSubReg_f64_reg imm:$lane))),
+ (DSubReg_f64_other_reg imm:$lane))>;
+def : Pat<(v2f64 (NEONvduplane (v2f64 QPR:$src), imm:$lane)),
+ (INSERT_SUBREG QPR:$src,
+ (f64 (EXTRACT_SUBREG QPR:$src, (DSubReg_f64_reg imm:$lane))),
+ (DSubReg_f64_other_reg imm:$lane))>;
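// There is no VDUP for 64-bit lanes, so duplicating a D-sized lane of a Q
// register is done with plain moves: EXTRACT_SUBREG reads the selected half
// (DSubReg_f64_reg) and INSERT_SUBREG writes it over the opposite half
// (DSubReg_f64_other_reg). For lane 0 of q0 this amounts to "vmov d1, d0".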
+
// VMOVN : Vector Narrowing Move
-defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, "vmovn.i",
+defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD, "vmovn.i",
int_arm_neon_vmovn>;
// VQMOVN : Vector Saturating Narrowing Move
-defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, "vqmovn.s",
+defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD, "vqmovn.s",
int_arm_neon_vqmovns>;
-defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, "vqmovn.u",
+defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD, "vqmovn.u",
int_arm_neon_vqmovnu>;
-defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, "vqmovun.s",
+defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD, "vqmovun.s",
int_arm_neon_vqmovnsu>;
// VMOVL : Vector Lengthening Move
defm VMOVLs : N2VLInt_QHS<0,1,0b1010,0,0,1, "vmovl.s", int_arm_neon_vmovls>;
@@ -1597,6 +2605,247 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
def VCVTxu2fq : N2VCvtQ<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
+// Vector Reverse.
+
+// VREV64 : Vector Reverse elements within 64-bit doublewords
+
+class VREV64D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$dst),
+ (ins DPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (Ty (NEONvrev64 (Ty DPR:$src))))]>;
+class VREV64Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$dst),
+ (ins QPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (Ty (NEONvrev64 (Ty QPR:$src))))]>;
+
+def VREV64d8 : VREV64D<0b00, "vrev64.8", v8i8>;
+def VREV64d16 : VREV64D<0b01, "vrev64.16", v4i16>;
+def VREV64d32 : VREV64D<0b10, "vrev64.32", v2i32>;
+def VREV64df : VREV64D<0b10, "vrev64.32", v2f32>;
+
+def VREV64q8 : VREV64Q<0b00, "vrev64.8", v16i8>;
+def VREV64q16 : VREV64Q<0b01, "vrev64.16", v8i16>;
+def VREV64q32 : VREV64Q<0b10, "vrev64.32", v4i32>;
+def VREV64qf : VREV64Q<0b10, "vrev64.32", v4f32>;
+
+// VREV32 : Vector Reverse elements within 32-bit words
+
+class VREV32D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$dst),
+ (ins DPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (Ty (NEONvrev32 (Ty DPR:$src))))]>;
+class VREV32Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$dst),
+ (ins QPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (Ty (NEONvrev32 (Ty QPR:$src))))]>;
+
+def VREV32d8 : VREV32D<0b00, "vrev32.8", v8i8>;
+def VREV32d16 : VREV32D<0b01, "vrev32.16", v4i16>;
+
+def VREV32q8 : VREV32Q<0b00, "vrev32.8", v16i8>;
+def VREV32q16 : VREV32Q<0b01, "vrev32.16", v8i16>;
+
+// VREV16 : Vector Reverse elements within 16-bit halfwords
+
+class VREV16D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$dst),
+ (ins DPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (Ty (NEONvrev16 (Ty DPR:$src))))]>;
+class VREV16Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$dst),
+ (ins QPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (Ty (NEONvrev16 (Ty QPR:$src))))]>;
+
+def VREV16d8 : VREV16D<0b00, "vrev16.8", v8i8>;
+def VREV16q8 : VREV16Q<0b00, "vrev16.8", v16i8>;
+
+// Other Vector Shuffles.
+
+// VEXT : Vector Extract
+
+class VEXTd<string OpcodeStr, ValueType Ty>
+ : N3V<0,1,0b11,0b0000,0,0, (outs DPR:$dst),
+ (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD,
+ !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
+ [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs),
+ (Ty DPR:$rhs), imm:$index)))]>;
+
+class VEXTq<string OpcodeStr, ValueType Ty>
+ : N3V<0,1,0b11,0b0000,1,0, (outs QPR:$dst),
+ (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ,
+ !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
+ [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs),
+ (Ty QPR:$rhs), imm:$index)))]>;
+
+def VEXTd8 : VEXTd<"vext.8", v8i8>;
+def VEXTd16 : VEXTd<"vext.16", v4i16>;
+def VEXTd32 : VEXTd<"vext.32", v2i32>;
+def VEXTdf : VEXTd<"vext.32", v2f32>;
+
+def VEXTq8 : VEXTq<"vext.8", v16i8>;
+def VEXTq16 : VEXTq<"vext.16", v8i16>;
+def VEXTq32 : VEXTq<"vext.32", v4i32>;
+def VEXTqf : VEXTq<"vext.32", v4f32>;
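// VEXT extracts a window from the concatenation of its two operands,
// starting at element $index of the first. A worked example of the
// byte-sized D-register form:
//   vext.8 d0, d1, d2, #3    @ d0 = { d1[3..7], d2[0..2] }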
+
+// VTRN : Vector Transpose
+
+def VTRNd8 : N2VDShuffle<0b00, 0b00001, "vtrn.8">;
+def VTRNd16 : N2VDShuffle<0b01, 0b00001, "vtrn.16">;
+def VTRNd32 : N2VDShuffle<0b10, 0b00001, "vtrn.32">;
+
+def VTRNq8 : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn.8">;
+def VTRNq16 : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn.16">;
+def VTRNq32 : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn.32">;
+
+// VUZP : Vector Unzip (Deinterleave)
+
+def VUZPd8 : N2VDShuffle<0b00, 0b00010, "vuzp.8">;
+def VUZPd16 : N2VDShuffle<0b01, 0b00010, "vuzp.16">;
+def VUZPd32 : N2VDShuffle<0b10, 0b00010, "vuzp.32">;
+
+def VUZPq8 : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp.8">;
+def VUZPq16 : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp.16">;
+def VUZPq32 : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp.32">;
+
+// VZIP : Vector Zip (Interleave)
+
+def VZIPd8 : N2VDShuffle<0b00, 0b00011, "vzip.8">;
+def VZIPd16 : N2VDShuffle<0b01, 0b00011, "vzip.16">;
+def VZIPd32 : N2VDShuffle<0b10, 0b00011, "vzip.32">;
+
+def VZIPq8 : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip.8">;
+def VZIPq16 : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip.16">;
+def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip.32">;
+
+// Vector Table Lookup and Table Extension.
+
+// VTBL : Vector Table Lookup
+def VTBL1
+ : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst),
+ (ins DPR:$tbl1, DPR:$src), IIC_VTB1,
+ "vtbl.8\t$dst, \\{$tbl1\\}, $src", "",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def VTBL2
+ : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst),
+ (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTB2,
+ "vtbl.8\t$dst, \\{$tbl1,$tbl2\\}, $src", "",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2
+ DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>;
+def VTBL3
+ : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst),
+ (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTB3,
+ "vtbl.8\t$dst, \\{$tbl1,$tbl2,$tbl3\\}, $src", "",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3
+ DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>;
+def VTBL4
+ : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst),
+ (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTB4,
+ "vtbl.8\t$dst, \\{$tbl1,$tbl2,$tbl3,$tbl4\\}, $src", "",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2,
+ DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>;
+} // hasExtraSrcRegAllocReq = 1
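// The table operands of VTBL2-VTBL4 must be allocated to consecutive D
// registers (e.g. "vtbl.8 d0, {d2,d3}, d4"); hasExtraSrcRegAllocReq marks
// that the variadic-looking sources carry allocation constraints the
// operand list alone does not express.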
+
+// VTBX : Vector Table Extension
+def VTBX1
+ : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst),
+ (ins DPR:$orig, DPR:$tbl1, DPR:$src), IIC_VTBX1,
+ "vtbx.8\t$dst, \\{$tbl1\\}, $src", "$orig = $dst",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1
+ DPR:$orig, DPR:$tbl1, DPR:$src)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def VTBX2
+ : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst),
+ (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTBX2,
+ "vtbx.8\t$dst, \\{$tbl1,$tbl2\\}, $src", "$orig = $dst",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2
+ DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>;
+def VTBX3
+ : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst),
+ (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTBX3,
+ "vtbx.8\t$dst, \\{$tbl1,$tbl2,$tbl3\\}, $src", "$orig = $dst",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1,
+ DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>;
+def VTBX4
+ : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1,
+ DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTBX4,
+ "vtbx.8\t$dst, \\{$tbl1,$tbl2,$tbl3,$tbl4\\}, $src", "$orig = $dst",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1,
+ DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>;
+} // hasExtraSrcRegAllocReq = 1
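// VTBX differs from VTBL in its out-of-range behavior: lanes whose index
// falls outside the table are left unchanged rather than zeroed, which is
// why every VTBX form reads its destination through the "$orig = $dst" tie:
//   vtbx.8 d0, {d1}, d2    @ d0[i] = (d2[i] < 8) ? d1[d2[i]] : d0[i]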
+
+//===----------------------------------------------------------------------===//
+// NEON instructions for single-precision FP math
+//===----------------------------------------------------------------------===//
+
+// These need separate instructions because they must use the DPR_VFP2
+// register class, which has SPR sub-registers.
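// Keeping scalar f32 math on NEON requires the f32 operand, which lives in
// an S register, to alias the D register the NEON op reads and writes;
// only D0-D15 have S sub-registers, hence the restriction. E.g. an fadd of
// s0 and s2 can be issued as:
//   vadd.f32 d0, d0, d1    @ lane 0 holds s0+s2; the other lane is garbage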
+
+// Vector Add Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VADDfd_sfp : N3VDs<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd,1>;
+def : N3VDsPat<fadd, VADDfd_sfp>;
+
+// Vector Sub Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VSUBfd_sfp : N3VDs<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub,0>;
+def : N3VDsPat<fsub, VSUBfd_sfp>;
+
+// Vector Multiply Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VMULfd_sfp : N3VDs<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul,1>;
+def : N3VDsPat<fmul, VMULfd_sfp>;
+
+// Vector Multiply-Accumulate/Subtract used for single-precision FP
+let neverHasSideEffects = 1 in
+def VMLAfd_sfp : N3VDMulOps<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla.f32", v2f32,fmul,fadd>;
+def : N3VDMulOpsPat<fmul, fadd, VMLAfd_sfp>;
+
+let neverHasSideEffects = 1 in
+def VMLSfd_sfp : N3VDMulOps<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls.f32", v2f32,fmul,fsub>;
+def : N3VDMulOpsPat<fmul, fsub, VMLSfd_sfp>;
+
+// Vector Absolute used for single-precision FP
+let neverHasSideEffects = 1 in
+def VABSfd_sfp : N2VDInts<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+ IIC_VUNAD, "vabs.f32",
+ v2f32, v2f32, int_arm_neon_vabs>;
+def : N2VDIntsPat<fabs, VABSfd_sfp>;
+
+// Vector Negate used for single-precision FP
+let neverHasSideEffects = 1 in
+def VNEGf32d_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
+ (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD,
+ "vneg.f32\t$dst, $src", "", []>;
+def : N2VDIntsPat<fneg, VNEGf32d_sfp>;
+
+// Vector Convert between single-precision FP and integer
+let neverHasSideEffects = 1 in
+def VCVTf2sd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt.s32.f32",
+ v2i32, v2f32, fp_to_sint>;
+def : N2VDsPat<arm_ftosi, f32, v2f32, VCVTf2sd_sfp>;
+
+let neverHasSideEffects = 1 in
+def VCVTf2ud_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt.u32.f32",
+ v2i32, v2f32, fp_to_uint>;
+def : N2VDsPat<arm_ftoui, f32, v2f32, VCVTf2ud_sfp>;
+
+let neverHasSideEffects = 1 in
+def VCVTs2fd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt.f32.s32",
+ v2f32, v2i32, sint_to_fp>;
+def : N2VDsPat<arm_sitof, f32, v2i32, VCVTs2fd_sfp>;
+
+let neverHasSideEffects = 1 in
+def VCVTu2fd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt.f32.u32",
+ v2f32, v2i32, uint_to_fp>;
+def : N2VDsPat<arm_uitof, f32, v2i32, VCVTu2fd_sfp>;
+
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 904d9b1..9816add 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -117,86 +117,150 @@ def t_addrmode_sp : Operand<i32>,
let Defs = [SP], Uses = [SP] in {
def tADJCALLSTACKUP :
-PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
"@ tADJCALLSTACKUP $amt1",
- [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb]>;
+ [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb1Only]>;
def tADJCALLSTACKDOWN :
-PseudoInst<(outs), (ins i32imm:$amt),
+PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
"@ tADJCALLSTACKDOWN $amt",
- [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb]>;
+ [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb1Only]>;
}
+// For both Thumb1 and Thumb2.
let isNotDuplicable = 1 in
-def tPICADD : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, pclabel:$cp),
- "$cp:\n\tadd $dst, pc",
- [(set tGPR:$dst, (ARMpic_add tGPR:$lhs, imm:$cp))]>;
+def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr,
+ "\n$cp:\n\tadd $dst, pc",
+ [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>;
// PC relative add.
-def tADDrPCi : T1I<(outs tGPR:$dst), (ins i32imm:$rhs),
+def tADDrPCi : T1I<(outs tGPR:$dst), (ins i32imm:$rhs), IIC_iALUi,
"add $dst, pc, $rhs * 4", []>;
// ADD rd, sp, #imm8
-// FIXME: hard code sp?
-def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, i32imm:$rhs),
- "add $dst, $sp, $rhs * 4 @ addrspi", []>;
+def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, i32imm:$rhs), IIC_iALUi,
+ "add $dst, $sp, $rhs * 4", []>;
// ADD sp, sp, #imm7
-// FIXME: hard code sp?
-def tADDspi : T1It<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
+def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALUi,
"add $dst, $rhs * 4", []>;
-// FIXME: Make use of the following?
-// ADD rm, sp, rm
+// SUB sp, sp, #imm7
+def tSUBspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALUi,
+ "sub $dst, $rhs * 4", []>;
+
+// ADD rm, sp
+def tADDrSP : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ "add $dst, $rhs", []>;
+
// ADD sp, rm
+def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ "add $dst, $rhs", []>;
+
+// Pseudo instructions that will expand into the real instruction plus a copy.
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+def tSUBspi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
+ NoItinerary, "@ sub $dst, $rhs * 4", []>;
+
+def tADDspr_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
+ NoItinerary, "@ add $dst, $rhs", []>;
+
+let Defs = [CPSR] in
+def tANDsp : PseudoInst<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ NoItinerary, "@ and $dst, $rhs", []>;
+} // usesCustomDAGSchedInserter
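// These pseudos are expanded after selection by the target's custom
// inserter hook. A rough C++ sketch of the tSUBspi_ case, assuming the
// usual EmitInstrWithCustomInserter plumbing (names illustrative):
//   case ARM::tSUBspi_:
//     BuildMI(*BB, MI, dl, TII->get(ARM::tSUBspi), ARM::SP)
//       .addReg(ARM::SP).addImm(Imm);   // real sub, then copy SP to $dst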
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
-let isReturn = 1, isTerminator = 1 in {
- def tBX_RET : TI<(outs), (ins), "bx lr", [(ARMretflag)]>;
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+ def tBX_RET : TI<(outs), (ins), IIC_Br, "bx lr", [(ARMretflag)]>;
// Alternative return instruction used by vararg functions.
- def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), "bx $target", []>;
+ def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), IIC_Br, "bx $target", []>;
}
// FIXME: remove when we have a way of marking an MI with these properties.
-let isReturn = 1, isTerminator = 1 in
-def tPOP_RET : TI<(outs reglist:$dst1, variable_ops), (ins),
- "pop $dst1", []>;
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+ hasExtraDefRegAllocReq = 1 in
+def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br,
+ "pop${p} $wb", []>;
let isCall = 1,
- Defs = [R0, R1, R2, R3, LR,
- D0, D1, D2, D3, D4, D5, D6, D7] in {
- def tBL : TIx2<(outs), (ins i32imm:$func, variable_ops),
+ Defs = [R0, R1, R2, R3, R12, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
+ // Also used for Thumb2
+ def tBL : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br,
"bl ${func:call}",
- [(ARMtcall tglobaladdr:$func)]>;
- // ARMv5T and above
- def tBLXi : TIx2<(outs), (ins i32imm:$func, variable_ops),
+ [(ARMtcall tglobaladdr:$func)]>,
+ Requires<[IsThumb, IsNotDarwin]>;
+
+ // ARMv5T and above, also used for Thumb2
+ def tBLXi : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br,
"blx ${func:call}",
- [(ARMcall tglobaladdr:$func)]>, Requires<[HasV5T]>;
- def tBLXr : TI<(outs), (ins tGPR:$func, variable_ops),
+ [(ARMcall tglobaladdr:$func)]>,
+ Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+
+ // Also used for Thumb2
+ def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br,
+ "blx $func",
+ [(ARMtcall GPR:$func)]>,
+ Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+
+ // ARMv4T
+ def tBX : TIx2<(outs), (ins tGPR:$func, variable_ops), IIC_Br,
+ "mov lr, pc\n\tbx $func",
+ [(ARMcall_nolink tGPR:$func)]>,
+ Requires<[IsThumb1Only, IsNotDarwin]>;
+}
+
+// On Darwin R9 is call-clobbered.
+let isCall = 1,
+ Defs = [R0, R1, R2, R3, R9, R12, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
+ // Also used for Thumb2
+ def tBLr9 : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br,
+ "bl ${func:call}",
+ [(ARMtcall tglobaladdr:$func)]>,
+ Requires<[IsThumb, IsDarwin]>;
+
+ // ARMv5T and above, also used for Thumb2
+ def tBLXi_r9 : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br,
+ "blx ${func:call}",
+ [(ARMcall tglobaladdr:$func)]>,
+ Requires<[IsThumb, HasV5T, IsDarwin]>;
+
+ // Also used for Thumb2
+ def tBLXr_r9 : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br,
"blx $func",
- [(ARMtcall tGPR:$func)]>, Requires<[HasV5T]>;
+ [(ARMtcall GPR:$func)]>,
+ Requires<[IsThumb, HasV5T, IsDarwin]>;
+
// ARMv4T
- def tBX : TIx2<(outs), (ins tGPR:$func, variable_ops),
- "cpy lr, pc\n\tbx $func",
- [(ARMcall_nolink tGPR:$func)]>;
+ def tBXr9 : TIx2<(outs), (ins tGPR:$func, variable_ops), IIC_Br,
+ "mov lr, pc\n\tbx $func",
+ [(ARMcall_nolink tGPR:$func)]>,
+ Requires<[IsThumb1Only, IsDarwin]>;
}
let isBranch = 1, isTerminator = 1 in {
let isBarrier = 1 in {
let isPredicable = 1 in
- def tB : T1I<(outs), (ins brtarget:$target), "b $target",
- [(br bb:$target)]>;
+ def tB : T1I<(outs), (ins brtarget:$target), IIC_Br,
+ "b $target", [(br bb:$target)]>;
// Far jump
- def tBfar : T1Ix2<(outs), (ins brtarget:$target),
+ let Defs = [LR] in
+ def tBfar : TIx2<(outs), (ins brtarget:$target), IIC_Br,
"bl $target\t@ far jump",[]>;
def tBR_JTr : T1JTI<(outs),
(ins tGPR:$target, jtblock_operand:$jt, i32imm:$id),
- "cpy pc, $target \n\t.align\t2\n$jt",
+ IIC_Br, "mov pc, $target\n\t.align\t2\n$jt",
[(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>;
}
}
@@ -204,7 +268,8 @@ let isBranch = 1, isTerminator = 1 in {
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
let isBranch = 1, isTerminator = 1 in
- def tBcc : T1I<(outs), (ins brtarget:$target, pred:$cc), "b$cc $target",
+ def tBcc : T1I<(outs), (ins brtarget:$target, pred:$cc), IIC_Br,
+ "b$cc $target",
[/*(ARMbrcond bb:$target, imm:$cc)*/]>;
//===----------------------------------------------------------------------===//
@@ -212,384 +277,363 @@ let isBranch = 1, isTerminator = 1 in
//
let canFoldAsLoad = 1 in
-def tLDR : T1I4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr),
- "ldr $dst, $addr",
+def tLDR : T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr,
+ "ldr", " $dst, $addr",
[(set tGPR:$dst, (load t_addrmode_s4:$addr))]>;
-def tLDRB : T1I1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr),
- "ldrb $dst, $addr",
+def tLDRB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr,
+ "ldrb", " $dst, $addr",
[(set tGPR:$dst, (zextloadi8 t_addrmode_s1:$addr))]>;
-def tLDRH : T1I2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr),
- "ldrh $dst, $addr",
+def tLDRH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr,
+ "ldrh", " $dst, $addr",
[(set tGPR:$dst, (zextloadi16 t_addrmode_s2:$addr))]>;
-def tLDRSB : T1I1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr),
- "ldrsb $dst, $addr",
+let AddedComplexity = 10 in
+def tLDRSB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr,
+ "ldrsb", " $dst, $addr",
[(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>;
-def tLDRSH : T1I2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr),
- "ldrsh $dst, $addr",
+let AddedComplexity = 10 in
+def tLDRSH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr,
+ "ldrsh", " $dst, $addr",
[(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>;
let canFoldAsLoad = 1 in
-def tLDRspi : T1Is<(outs tGPR:$dst), (ins t_addrmode_sp:$addr),
- "ldr $dst, $addr",
+def tLDRspi : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi,
+ "ldr", " $dst, $addr",
[(set tGPR:$dst, (load t_addrmode_sp:$addr))]>;
// Special instruction for restore. It cannot clobber the condition register
// when it's expanded by eliminateCallFramePseudoInstr().
let canFoldAsLoad = 1, mayLoad = 1 in
-def tRestore : T1Is<(outs tGPR:$dst), (ins t_addrmode_sp:$addr),
- "ldr $dst, $addr", []>;
+def tRestore : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi,
+ "ldr", " $dst, $addr", []>;
// Load tconstpool
let canFoldAsLoad = 1 in
-def tLDRpci : T1Is<(outs tGPR:$dst), (ins i32imm:$addr),
- "ldr $dst, $addr",
+def tLDRpci : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
+ "ldr", " $dst, $addr",
[(set tGPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>;
// Special LDR for loads from non-pc-relative constpools.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1 in
-def tLDRcp : T1Is<(outs tGPR:$dst), (ins i32imm:$addr),
- "ldr $dst, $addr", []>;
+def tLDRcp : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
+ "ldr", " $dst, $addr", []>;
-def tSTR : T1I4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr),
- "str $src, $addr",
+def tSTR : T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer,
+ "str", " $src, $addr",
[(store tGPR:$src, t_addrmode_s4:$addr)]>;
-def tSTRB : T1I1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr),
- "strb $src, $addr",
+def tSTRB : T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer,
+ "strb", " $src, $addr",
[(truncstorei8 tGPR:$src, t_addrmode_s1:$addr)]>;
-def tSTRH : T1I2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr),
- "strh $src, $addr",
+def tSTRH : T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer,
+ "strh", " $src, $addr",
[(truncstorei16 tGPR:$src, t_addrmode_s2:$addr)]>;
-def tSTRspi : T1Is<(outs), (ins tGPR:$src, t_addrmode_sp:$addr),
- "str $src, $addr",
+def tSTRspi : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei,
+ "str", " $src, $addr",
[(store tGPR:$src, t_addrmode_sp:$addr)]>;
let mayStore = 1 in {
// Special instruction for spill. It cannot clobber the condition register
// when it's expanded by eliminateCallFramePseudoInstr().
-def tSpill : T1Is<(outs), (ins tGPR:$src, t_addrmode_sp:$addr),
- "str $src, $addr", []>;
+def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei,
+ "str", " $src, $addr", []>;
}
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
-// TODO: A7-44: LDMIA - load multiple
+// These require the base address to be written back or to be one of the loaded regs.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def tLDM : T1I<(outs),
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ IIC_iLoadm,
+ "ldm${addr:submode}${p} $addr, $wb", []>;
-let mayLoad = 1 in
-def tPOP : TI<(outs reglist:$dst1, variable_ops), (ins),
- "pop $dst1", []>;
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def tSTM : T1I<(outs),
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ IIC_iStorem,
+ "stm${addr:submode}${p} $addr, $wb", []>;
-let mayStore = 1 in
-def tPUSH : TI<(outs), (ins reglist:$src1, variable_ops),
- "push $src1", []>;
+let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
+def tPOP : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br,
+ "pop${p} $wb", []>;
+
+let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
+def tPUSH : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br,
+ "push${p} $wb", []>;
//===----------------------------------------------------------------------===//
// Arithmetic Instructions.
//
// Add with carry register
-let isCommutable = 1, Defs = [CPSR], Uses = [CPSR] in
-def tADCS : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "adc $dst, $rhs",
- [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>;
+let isCommutable = 1, Uses = [CPSR] in
+def tADC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "adc", " $dst, $rhs",
+ [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>;
// Add immediate
-let Defs = [CPSR] in {
-def tADDi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "add $dst, $lhs, $rhs",
- [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>;
-def tADDSi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "add $dst, $lhs, $rhs",
- [(set tGPR:$dst, (addc tGPR:$lhs, imm0_7:$rhs))]>;
-}
+def tADDi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+ "add", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>;
-let Defs = [CPSR] in {
-def tADDi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "add $dst, $rhs",
- [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>;
-def tADDSi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "add $dst, $rhs",
- [(set tGPR:$dst, (addc tGPR:$lhs, imm8_255:$rhs))]>;
-}
+def tADDi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+ "add", " $dst, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>;
// Add register
-let isCommutable = 1, Defs = [CPSR] in {
-def tADDrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "add $dst, $lhs, $rhs",
- [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>;
-def tADDSrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "add $dst, $lhs, $rhs",
- [(set tGPR:$dst, (addc tGPR:$lhs, tGPR:$rhs))]>;
-}
+let isCommutable = 1 in
+def tADDrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "add", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>;
let neverHasSideEffects = 1 in
-def tADDhirr : T1It<(outs tGPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- "add $dst, $rhs @ addhirr", []>;
+def tADDhirr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ "add", " $dst, $rhs", []>;
// And register
-let isCommutable = 1, Defs = [CPSR] in
-def tAND : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "and $dst, $rhs",
- [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>;
+let isCommutable = 1 in
+def tAND : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "and", " $dst, $rhs",
+ [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>;
// ASR immediate
-let Defs = [CPSR] in
-def tASRri : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "asr $dst, $lhs, $rhs",
- [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>;
+def tASRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+ "asr", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>;
// ASR register
-let Defs = [CPSR] in
-def tASRrr : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "asr $dst, $rhs",
- [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>;
+def tASRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+ "asr", " $dst, $rhs",
+ [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>;
// BIC register
-let Defs = [CPSR] in
-def tBIC : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "bic $dst, $rhs",
- [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>;
+def tBIC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "bic", " $dst, $rhs",
+ [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>;
// CMN register
let Defs = [CPSR] in {
-def tCMN : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs),
- "cmn $lhs, $rhs",
- [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
-def tCMNZ : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs),
- "cmn $lhs, $rhs",
- [(ARMcmpZ tGPR:$lhs, (ineg tGPR:$rhs))]>;
+def tCMN : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+ "cmn", " $lhs, $rhs",
+ [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
+def tCMNZ : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+ "cmn", " $lhs, $rhs",
+ [(ARMcmpZ tGPR:$lhs, (ineg tGPR:$rhs))]>;
}
// CMP immediate
let Defs = [CPSR] in {
-def tCMPi8 : T1I<(outs), (ins tGPR:$lhs, i32imm:$rhs),
- "cmp $lhs, $rhs",
- [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>;
-def tCMPZi8 : T1I<(outs), (ins tGPR:$lhs, i32imm:$rhs),
- "cmp $lhs, $rhs",
- [(ARMcmpZ tGPR:$lhs, imm0_255:$rhs)]>;
+def tCMPi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi,
+ "cmp", " $lhs, $rhs",
+ [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>;
+def tCMPzi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi,
+ "cmp", " $lhs, $rhs",
+ [(ARMcmpZ tGPR:$lhs, imm0_255:$rhs)]>;
}
// CMP register
let Defs = [CPSR] in {
-def tCMPr : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs),
- "cmp $lhs, $rhs",
- [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>;
-def tCMPZr : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs),
- "cmp $lhs, $rhs",
- [(ARMcmpZ tGPR:$lhs, tGPR:$rhs)]>;
+def tCMPr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+ "cmp", " $lhs, $rhs",
+ [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>;
+def tCMPzr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+ "cmp", " $lhs, $rhs",
+ [(ARMcmpZ tGPR:$lhs, tGPR:$rhs)]>;
+
+def tCMPhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
+ "cmp", " $lhs, $rhs", []>;
+def tCMPzhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
+ "cmp", " $lhs, $rhs", []>;
}
-// TODO: A7-37: CMP(3) - cmp hi regs
// XOR register
-let isCommutable = 1, Defs = [CPSR] in
-def tEOR : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "eor $dst, $rhs",
- [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>;
+let isCommutable = 1 in
+def tEOR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "eor", " $dst, $rhs",
+ [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>;
// LSL immediate
-let Defs = [CPSR] in
-def tLSLri : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "lsl $dst, $lhs, $rhs",
- [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>;
+def tLSLri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+ "lsl", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>;
// LSL register
-let Defs = [CPSR] in
-def tLSLrr : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "lsl $dst, $rhs",
- [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>;
+def tLSLrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+ "lsl", " $dst, $rhs",
+ [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>;
// LSR immediate
-let Defs = [CPSR] in
-def tLSRri : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "lsr $dst, $lhs, $rhs",
- [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>;
+def tLSRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+ "lsr", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>;
// LSR register
-let Defs = [CPSR] in
-def tLSRrr : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "lsr $dst, $rhs",
- [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>;
+def tLSRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+ "lsr", " $dst, $rhs",
+ [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>;
// move register
-let Defs = [CPSR] in
-def tMOVi8 : T1I<(outs tGPR:$dst), (ins i32imm:$src),
- "mov $dst, $src",
- [(set tGPR:$dst, imm0_255:$src)]>;
+def tMOVi8 : T1sI<(outs tGPR:$dst), (ins i32imm:$src), IIC_iMOVi,
+ "mov", " $dst, $src",
+ [(set tGPR:$dst, imm0_255:$src)]>;
// TODO: A7-73: MOV(2) - mov setting flag.
-// Note: MOV(2) of two low regs updates the flags, so we emit this as 'cpy',
-// which is MOV(3). This also supports high registers.
let neverHasSideEffects = 1 in {
-def tMOVr : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "cpy $dst, $src", []>;
-def tMOVhir2lor : T1I<(outs tGPR:$dst), (ins GPR:$src),
- "cpy $dst, $src\t@ hir2lor", []>;
-def tMOVlor2hir : T1I<(outs GPR:$dst), (ins tGPR:$src),
- "cpy $dst, $src\t@ lor2hir", []>;
-def tMOVhir2hir : T1I<(outs GPR:$dst), (ins GPR:$src),
- "cpy $dst, $src\t@ hir2hir", []>;
+// FIXME: Make this predicable.
+def tMOVr : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+ "mov $dst, $src", []>;
+let Defs = [CPSR] in
+def tMOVSr : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+ "movs $dst, $src", []>;
+
+// FIXME: Make these predicable.
+def tMOVgpr2tgpr : T1I<(outs tGPR:$dst), (ins GPR:$src), IIC_iMOVr,
+ "mov $dst, $src", []>;
+def tMOVtgpr2gpr : T1I<(outs GPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+ "mov $dst, $src", []>;
+def tMOVgpr2gpr : T1I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
+ "mov $dst, $src", []>;
} // neverHasSideEffects
// multiply register
-let isCommutable = 1, Defs = [CPSR] in
-def tMUL : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "mul $dst, $rhs",
- [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>;
+let isCommutable = 1 in
+def tMUL : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMUL32,
+ "mul", " $dst, $rhs",
+ [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>;
// move inverse register
-let Defs = [CPSR] in
-def tMVN : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "mvn $dst, $src",
- [(set tGPR:$dst, (not tGPR:$src))]>;
-
-// negate register
-let Defs = [CPSR] in
-def tNEG : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "neg $dst, $src",
- [(set tGPR:$dst, (ineg tGPR:$src))]>;
+def tMVN : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+ "mvn", " $dst, $src",
+ [(set tGPR:$dst, (not tGPR:$src))]>;
// bitwise or register
-let isCommutable = 1, Defs = [CPSR] in
-def tORR : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "orr $dst, $rhs",
- [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>;
+let isCommutable = 1 in
+def tORR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "orr", " $dst, $rhs",
+ [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>;
// byte-order swaps
-def tREV : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "rev $dst, $src",
- [(set tGPR:$dst, (bswap tGPR:$src))]>,
- Requires<[IsThumb, HasV6]>;
-
-def tREV16 : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "rev16 $dst, $src",
- [(set tGPR:$dst,
- (or (and (srl tGPR:$src, (i32 8)), 0xFF),
- (or (and (shl tGPR:$src, (i32 8)), 0xFF00),
- (or (and (srl tGPR:$src, (i32 8)), 0xFF0000),
- (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>,
- Requires<[IsThumb, HasV6]>;
-
-def tREVSH : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "revsh $dst, $src",
- [(set tGPR:$dst,
- (sext_inreg
- (or (srl (and tGPR:$src, 0xFFFF), (i32 8)),
- (shl tGPR:$src, (i32 8))), i16))]>,
- Requires<[IsThumb, HasV6]>;
+def tREV : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "rev", " $dst, $src",
+ [(set tGPR:$dst, (bswap tGPR:$src))]>,
+ Requires<[IsThumb1Only, HasV6]>;
+
+def tREV16 : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "rev16", " $dst, $src",
+ [(set tGPR:$dst,
+ (or (and (srl tGPR:$src, (i32 8)), 0xFF),
+ (or (and (shl tGPR:$src, (i32 8)), 0xFF00),
+ (or (and (srl tGPR:$src, (i32 8)), 0xFF0000),
+ (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>,
+ Requires<[IsThumb1Only, HasV6]>;
+
+def tREVSH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "revsh", " $dst, $src",
+ [(set tGPR:$dst,
+ (sext_inreg
+ (or (srl (and tGPR:$src, 0xFF00), (i32 8)),
+ (shl tGPR:$src, (i32 8))), i16))]>,
+ Requires<[IsThumb1Only, HasV6]>;
// rotate right register
-let Defs = [CPSR] in
-def tROR : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "ror $dst, $rhs",
- [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>;
+def tROR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+ "ror", " $dst, $rhs",
+ [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>;
+
+// negate register
+def tRSB : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iALUi,
+ "rsb", " $dst, $src, #0",
+ [(set tGPR:$dst, (ineg tGPR:$src))]>;
// Subtract with carry register
-let Defs = [CPSR], Uses = [CPSR] in
-def tSBCS : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "sbc $dst, $rhs",
- [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>;
+let Uses = [CPSR] in
+def tSBC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "sbc", " $dst, $rhs",
+ [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>;
// Subtract immediate
-let Defs = [CPSR] in {
-def tSUBi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "sub $dst, $lhs, $rhs",
- [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>;
-def tSUBSi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "sub $dst, $lhs, $rhs",
- [(set tGPR:$dst, (addc tGPR:$lhs, imm0_7_neg:$rhs))]>;
-}
+def tSUBi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+ "sub", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>;
-let Defs = [CPSR] in {
-def tSUBi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "sub $dst, $rhs",
- [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>;
-def tSUBSi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "sub $dst, $rhs",
- [(set tGPR:$dst, (addc tGPR:$lhs, imm8_255_neg:$rhs))]>;
-}
+def tSUBi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+ "sub", " $dst, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>;
// subtract register
-let Defs = [CPSR] in {
-def tSUBrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "sub $dst, $lhs, $rhs",
- [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>;
-def tSUBSrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "sub $dst, $lhs, $rhs",
- [(set tGPR:$dst, (subc tGPR:$lhs, tGPR:$rhs))]>;
-}
+def tSUBrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "sub", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>;
// TODO: A7-96: STMIA - store multiple.
-def tSUBspi : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "sub $dst, $rhs * 4", []>;
-
// sign-extend byte
-def tSXTB : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "sxtb $dst, $src",
- [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>,
- Requires<[IsThumb, HasV6]>;
+def tSXTB : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "sxtb", " $dst, $src",
+ [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>,
+ Requires<[IsThumb1Only, HasV6]>;
// sign-extend short
-def tSXTH : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "sxth $dst, $src",
- [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>,
- Requires<[IsThumb, HasV6]>;
+def tSXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "sxth", " $dst, $src",
+ [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>,
+ Requires<[IsThumb1Only, HasV6]>;
// test
let isCommutable = 1, Defs = [CPSR] in
-def tTST : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs),
- "tst $lhs, $rhs",
- [(ARMcmpZ (and tGPR:$lhs, tGPR:$rhs), 0)]>;
+def tTST : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+ "tst", " $lhs, $rhs",
+ [(ARMcmpZ (and tGPR:$lhs, tGPR:$rhs), 0)]>;
// zero-extend byte
-def tUXTB : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "uxtb $dst, $src",
- [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>,
- Requires<[IsThumb, HasV6]>;
+def tUXTB : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "uxtb", " $dst, $src",
+ [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>,
+ Requires<[IsThumb1Only, HasV6]>;
// zero-extend short
-def tUXTH : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "uxth $dst, $src",
- [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>,
- Requires<[IsThumb, HasV6]>;
+def tUXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "uxth", " $dst, $src",
+ [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>,
+ Requires<[IsThumb1Only, HasV6]>;
// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC DAG operation.
// Expanded by the scheduler into a branch sequence.
let usesCustomDAGSchedInserter = 1 in // Expanded by the scheduler.
- def tMOVCCr :
+ def tMOVCCr_pseudo :
PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc),
- "@ tMOVCCr $cc",
- [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
+ NoItinerary, "@ tMOVCCr $cc",
+ [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
+
+
+// 16-bit movcc in IT blocks for Thumb2.
+def tMOVCCr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iCMOVr,
+ "mov", " $dst, $rhs", []>;
+
+def tMOVCCi : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iCMOVi,
+ "mov", " $dst, $rhs", []>;
// tLEApcrel - Load a pc-relative address into a register without offending the
// assembler.
-def tLEApcrel : TIx2<(outs tGPR:$dst), (ins i32imm:$label),
- !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
- "${:private}PCRELL${:uid}+4))\n"),
- !strconcat("\tmov $dst, #PCRELV${:uid}\n",
- "${:private}PCRELL${:uid}:\n\tadd $dst, pc")),
- []>;
-
-def tLEApcrelJT : TIx2<(outs tGPR:$dst), (ins i32imm:$label, i32imm:$id),
- !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
- "${:private}PCRELL${:uid}+4))\n"),
- !strconcat("\tmov $dst, #PCRELV${:uid}\n",
- "${:private}PCRELL${:uid}:\n\tadd $dst, pc")),
- []>;
+def tLEApcrel : T1I<(outs tGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi,
+ "adr$p $dst, #$label", []>;
+
+def tLEApcrelJT : T1I<(outs tGPR:$dst),
+ (ins i32imm:$label, nohash_imm:$id, pred:$p),
+ IIC_iALUi, "adr$p $dst, #${label}_${id}", []>;
//===----------------------------------------------------------------------===//
// TLS Instructions
@@ -598,7 +642,7 @@ def tLEApcrelJT : TIx2<(outs tGPR:$dst), (ins i32imm:$label, i32imm:$id),
// __aeabi_read_tp preserves the registers r1-r3.
let isCall = 1,
Defs = [R0, LR] in {
- def tTPsoft : TIx2<(outs), (ins),
+ def tTPsoft : TIx2<(outs), (ins), IIC_Br,
"bl __aeabi_read_tp",
[(set R0, ARMthread_pointer)]>;
}
@@ -607,20 +651,46 @@ let isCall = 1,
// Non-Instruction Patterns
//
+// Carry-setting add
+def : T1Pat<(addc tGPR:$lhs, imm0_7:$rhs),
+ (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
+def : T1Pat<(addc tGPR:$lhs, imm8_255:$rhs),
+ (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
+def : T1Pat<(addc tGPR:$lhs, tGPR:$rhs),
+ (tADDrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Carry-setting subtract
+def : T1Pat<(addc tGPR:$lhs, imm0_7_neg:$rhs),
+ (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
+def : T1Pat<(addc tGPR:$lhs, imm8_255_neg:$rhs),
+ (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
+def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs),
+ (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
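// There is no separate carry-setting subtract-immediate pattern here;
// subtracting a small constant is matched as an addc of its negation, e.g.
//   (addc r, -5)  -->  (tSUBi3 r, 5)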
+
// ConstantPool, GlobalAddress
-def : TPat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>;
-def : TPat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
+def : T1Pat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>;
+def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
// JumpTable
-def : TPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
- (tLEApcrelJT tjumptable:$dst, imm:$id)>;
+def : T1Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+ (tLEApcrelJT tjumptable:$dst, imm:$id)>;
// Direct calls
-def : TPat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>;
-def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>;
+def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>,
+ Requires<[IsThumb, IsNotDarwin]>;
+def : T1Pat<(ARMtcall texternalsym:$func), (tBLr9 texternalsym:$func)>,
+ Requires<[IsThumb, IsDarwin]>;
+
+def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>,
+ Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi_r9 texternalsym:$func)>,
+ Requires<[IsThumb, HasV5T, IsDarwin]>;
// Indirect calls to ARM routines
-def : Tv5Pat<(ARMcall tGPR:$dst), (tBLXr tGPR:$dst)>;
+def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>,
+ Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr_r9 GPR:$dst)>,
+ Requires<[IsThumb, HasV5T, IsDarwin]>;
// zextload i1 -> zextload i8
def : T1Pat<(zextloadi1 t_addrmode_s1:$addr),
@@ -631,6 +701,20 @@ def : T1Pat<(extloadi1 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>;
def : T1Pat<(extloadi8 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>;
def : T1Pat<(extloadi16 t_addrmode_s2:$addr), (tLDRH t_addrmode_s2:$addr)>;
+// If the [r,r] address mode cannot be used for a sextload, select
+// ldr{b|h} + sxt{b|h} instead.
+def : T1Pat<(sextloadi8 t_addrmode_s1:$addr),
+ (tSXTB (tLDRB t_addrmode_s1:$addr))>,
+ Requires<[IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_s2:$addr),
+ (tSXTH (tLDRH t_addrmode_s2:$addr))>,
+ Requires<[IsThumb1Only, HasV6]>;
+
+def : T1Pat<(sextloadi8 t_addrmode_s1:$addr),
+ (tASRri (tLSLri (tLDRB t_addrmode_s1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi16 t_addrmode_s1:$addr),
+ (tASRri (tLSLri (tLDRH t_addrmode_s1:$addr), 16), 16)>;
+
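// Without v6's sxtb/sxth, sign extension of a loaded byte is synthesized
// with a shift pair, e.g.:
//   lsls r0, r0, #24    @ 0x000000ff -> 0xff000000
//   asrs r0, r0, #24    @ 0xff000000 -> 0xffffffff (-1)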
// Large immediate handling.
// Two piece imms.
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 50345a6..0750dcc 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -11,6 +11,21 @@
//
//===----------------------------------------------------------------------===//
+// IT block predicate field
+def it_pred : Operand<i32> {
+ let PrintMethod = "printPredicateOperand";
+}
+
+// IT block condition mask
+def it_mask : Operand<i32> {
+ let PrintMethod = "printThumbITMask";
+}
+
+// Table branch address
+def tb_addrmode : Operand<i32> {
+ let PrintMethod = "printTBAddrMode";
+}
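// PrintMethod names the asm-printer routine used to render the operand;
// for these three it presumably has the usual shape, roughly:
//   void ARMAsmPrinter::printThumbITMask(const MachineInstr *MI, int OpNum);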
+
// Shifted operands. No register controlled shifts for Thumb2.
// Note: We do not support rrx shifted operands yet.
def t2_so_reg : Operand<i32>, // reg imm
@@ -20,23 +35,14 @@ def t2_so_reg : Operand<i32>, // reg imm
let MIOperandInfo = (ops GPR, i32imm);
}
-// t2_so_imm_XFORM - Return a t2_so_imm value packed into the format
-// described for t2_so_imm def below.
-def t2_so_imm_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(
- ARM_AM::getT2SOImmVal(N->getZExtValue()), MVT::i32);
-}]>;
-
// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value
def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(
- ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())), MVT::i32);
+ return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
}]>;
// t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value
def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(
- ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())), MVT::i32);
+ return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32);
}]>;
// t2_so_imm - Match a 32-bit immediate operand, which is an
@@ -47,27 +53,21 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
// [bits 0-7], the 4-bit shift/splat amount is the next 4 bits [bits 8-11].
def t2_so_imm : Operand<i32>,
PatLeaf<(imm), [{
- return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1;
- }], t2_so_imm_XFORM> {
- let PrintMethod = "printT2SOImmOperand";
-}
+ return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1;
+}]>;
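// t2_so_imm thus accepts Thumb-2 "modified immediates": any byte 0x00-0xff,
// the splat forms 0x00XY00XY, 0xXY00XY00 and 0xXYXYXYXY, or an 8-bit value
// rotated into place; getT2SOImmVal returns the packed encoding, or -1 when
// no such form exists. For example 0x00AB00AB matches, 0x12345678 does not.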
// t2_so_imm_not - Match an immediate that is a complement
// of a t2_so_imm.
def t2_so_imm_not : Operand<i32>,
PatLeaf<(imm), [{
- return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1;
- }], t2_so_imm_not_XFORM> {
- let PrintMethod = "printT2SOImmOperand";
-}
+ return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1;
+}], t2_so_imm_not_XFORM>;
// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
def t2_so_imm_neg : Operand<i32>,
PatLeaf<(imm), [{
- return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1;
- }], t2_so_imm_neg_XFORM> {
- let PrintMethod = "printT2SOImmOperand";
-}
+ return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1;
+}], t2_so_imm_neg_XFORM>;
/// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31].
def imm1_31 : PatLeaf<(i32 imm), [{
@@ -75,7 +75,8 @@ def imm1_31 : PatLeaf<(i32 imm), [{
}]>;
/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095].
-def imm0_4095 : PatLeaf<(i32 imm), [{
+def imm0_4095 : Operand<i32>,
+ PatLeaf<(i32 imm), [{
return (uint32_t)N->getZExtValue() < 4096;
}]>;
@@ -83,48 +84,9 @@ def imm0_4095_neg : PatLeaf<(i32 imm), [{
return (uint32_t)(-N->getZExtValue()) < 4096;
}], imm_neg_XFORM>;
-/// imm0_65535 predicate - True if the 32-bit immediate is in the range
-/// [0.65535].
-def imm0_65535 : PatLeaf<(i32 imm), [{
- return (uint32_t)N->getZExtValue() < 65536;
-}]>;
-
-/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
-/// e.g., 0xf000ffff
-def bf_inv_mask_imm : Operand<i32>,
- PatLeaf<(imm), [{
- uint32_t v = (uint32_t)N->getZExtValue();
- if (v == 0xffffffff)
- return 0;
- // naive checker. should do better, but simple is best for now since it's
- // more likely to be correct.
- while (v & 1) v >>= 1; // shift off the leading 1's
- if (v)
- {
- while (!(v & 1)) v >>=1; // shift off the mask
- while (v & 1) v >>= 1; // shift off the trailing 1's
- }
- // if this is a mask for clearing a bitfield, what's left should be zero.
- return (v == 0);
-}] > {
- let PrintMethod = "printBitfieldInvMaskImmOperand";
-}
-
-/// Split a 32-bit immediate into two 16 bit parts.
-def t2_lo16 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() & 0xffff,
- MVT::i32);
-}]>;
-
-def t2_hi16 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32);
-}]>;
-
-def t2_lo16AllZero : PatLeaf<(i32 imm), [{
- // Returns true if all low 16-bits are 0.
- return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
- }], t2_hi16>;
-
+def imm0_255_neg : PatLeaf<(i32 imm), [{
+ return (uint32_t)(-N->getZExtValue()) < 255;
+}], imm_neg_XFORM>;
// Define Thumb2 specific addressing modes.
@@ -147,14 +109,14 @@ def t2am_imm8_offset : Operand<i32>,
let PrintMethod = "printT2AddrModeImm8OffsetOperand";
}
-// t2addrmode_imm8s4 := reg + (imm8 << 2)
+// t2addrmode_imm8s4 := reg +/- (imm8 << 2)
def t2addrmode_imm8s4 : Operand<i32>,
ComplexPattern<i32, 2, "SelectT2AddrModeImm8s4", []> {
- let PrintMethod = "printT2AddrModeImm8Operand";
+ let PrintMethod = "printT2AddrModeImm8s4Operand";
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
-// t2addrmode_so_reg := reg + reg << imm2
+// t2addrmode_so_reg := reg + (reg << imm2)
def t2addrmode_so_reg : Operand<i32>,
ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> {
let PrintMethod = "printT2AddrModeSoRegOperand";
@@ -171,52 +133,58 @@ def t2addrmode_so_reg : Operand<i32>,
/// changed to modify CPSR.
multiclass T2I_un_irs<string opc, PatFrag opnode, bit Cheap = 0, bit ReMat = 0>{
// shifted imm
- def i : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src),
+ def i : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi,
opc, " $dst, $src",
[(set GPR:$dst, (opnode t2_so_imm:$src))]> {
let isAsCheapAsAMove = Cheap;
let isReMaterializable = ReMat;
}
// register
- def r : T2I<(outs GPR:$dst), (ins GPR:$src),
- opc, " $dst, $src",
+ def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
+ opc, ".w $dst, $src",
[(set GPR:$dst, (opnode GPR:$src))]>;
// shifted register
- def s : T2I<(outs GPR:$dst), (ins t2_so_reg:$src),
- opc, " $dst, $src",
+ def s : T2I<(outs GPR:$dst), (ins t2_so_reg:$src), IIC_iMOVsi,
+ opc, ".w $dst, $src",
[(set GPR:$dst, (opnode t2_so_reg:$src))]>;
}
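// For example, the instantiation used later in this file,
//   defm t2MVN : T2I_un_irs<"mvn", UnOpFrag<(not node:$Src)>, 1, 1>;
// expands to t2MVNi, t2MVNr and t2MVNs, covering the immediate, register
// and shifted-register forms from a single definition.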
/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
/// binary operation that produces a value. These are predicable and can be
/// changed to modify CPSR.
-multiclass T2I_bin_irs<string opc, PatFrag opnode, bit Commutable = 0> {
+multiclass T2I_bin_irs<string opc, PatFrag opnode,
+                       bit Commutable = 0, string wide = ""> {
// shifted imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs),
+ def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
opc, " $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>;
// register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ opc, !strconcat(wide, " $dst, $lhs, $rhs"),
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
let isCommutable = Commutable;
}
// shifted register
- def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ opc, !strconcat(wide, " $dst, $lhs, $rhs"),
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>;
}
+/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need
+/// the ".w" suffix to indicate that they are wide.
+multiclass T2I_bin_w_irs<string opc, PatFrag opnode, bit Commutable = 0> :
+ T2I_bin_irs<opc, opnode, Commutable, ".w">;
+
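+// For example, defm t2AND : T2I_bin_w_irs<"and", ...> in the bitwise section
+// below produces t2ANDri, t2ANDrr and t2ANDrs, where the register and
+// shifted-register forms print as "and.w" to select the 32-bit encoding.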
/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands is
/// reversed. It doesn't define the 'rr' form since it's handled by its
/// T2I_bin_irs counterpart.
multiclass T2I_rbin_is<string opc, PatFrag opnode> {
// shifted imm
- def ri : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs),
- opc, " $dst, $rhs, $lhs",
+ def ri : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), IIC_iALUi,
+ opc, ".w $dst, $rhs, $lhs",
[(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>;
// shifted register
- def rs : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs),
+ def rs : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi,
opc, " $dst, $rhs, $lhs",
[(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>;
}
@@ -226,18 +194,18 @@ multiclass T2I_rbin_is<string opc, PatFrag opnode> {
let Defs = [CPSR] in {
multiclass T2I_bin_s_irs<string opc, PatFrag opnode, bit Commutable = 0> {
// shifted imm
- def ri : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs),
- !strconcat(opc, "s"), " $dst, $lhs, $rhs",
+ def ri : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
+ !strconcat(opc, "s"), ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>;
// register
- def rr : T2I<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- !strconcat(opc, "s"), " $dst, $lhs, $rhs",
+ def rr : T2I<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ !strconcat(opc, "s"), ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
let isCommutable = Commutable;
}
// shifted register
- def rs : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs),
- !strconcat(opc, "s"), " $dst, $lhs, $rhs",
+ def rs : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ !strconcat(opc, "s"), ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>;
}
}
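// For example, defm t2SUBS : T2I_bin_s_irs<"sub", ...> below yields
// t2SUBSri / t2SUBSrr / t2SUBSrs, each printing as "subs.w" and marked as a
// CPSR def by the enclosing 'let Defs = [CPSR]'.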
@@ -246,22 +214,22 @@ multiclass T2I_bin_s_irs<string opc, PatFrag opnode, bit Commutable = 0> {
/// patterns for a binary operation that produces a value.
multiclass T2I_bin_ii12rs<string opc, PatFrag opnode, bit Commutable = 0> {
// shifted imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>;
// 12-bit imm
- def ri12 : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
+ def ri12 : T2sI<(outs GPR:$dst), (ins GPR:$lhs, imm0_4095:$rhs), IIC_iALUi,
!strconcat(opc, "w"), " $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, imm0_4095:$rhs))]>;
// register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
let isCommutable = Commutable;
}
// shifted register
- def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>;
}
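// Sketch of a typical instantiation (hypothetical here; the actual defm
// lives with the arithmetic instructions):
//   defm t2ADD : T2I_bin_ii12rs<"add", BinOpFrag<(add node:$LHS, node:$RHS)>, 1>;
// would yield t2ADDri (add.w, modified immediate), t2ADDri12 (addw, plain
// 12-bit immediate), t2ADDrr and t2ADDrs.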
@@ -271,41 +239,41 @@ multiclass T2I_bin_ii12rs<string opc, PatFrag opnode, bit Commutable = 0> {
let Uses = [CPSR] in {
multiclass T2I_adde_sube_irs<string opc, PatFrag opnode, bit Commutable = 0> {
// shifted imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs),
+ def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
opc, " $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUnused]>;
// register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUnused]> {
let isCommutable = Commutable;
}
// shifted register
- def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUnused]>;
// Carry setting variants
// shifted imm
- def Sri : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs),
+ def Sri : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
!strconcat(opc, "s $dst, $lhs, $rhs"),
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUsed]> {
let Defs = [CPSR];
}
// register
- def Srr : T2XI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- !strconcat(opc, "s $dst, $lhs, $rhs"),
+ def Srr : T2XI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ !strconcat(opc, "s.w $dst, $lhs, $rhs"),
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUsed]> {
let Defs = [CPSR];
let isCommutable = Commutable;
}
// shifted register
- def Srs : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs),
- !strconcat(opc, "s $dst, $lhs, $rhs"),
+ def Srs : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ !strconcat(opc, "s.w $dst, $lhs, $rhs"),
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUsed]> {
let Defs = [CPSR];
@@ -313,49 +281,17 @@ multiclass T2I_adde_sube_irs<string opc, PatFrag opnode, bit Commutable = 0> {
}
}
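// For example, defm t2ADC : T2I_adde_sube_irs<"adc", ...> below produces
// both the carry-using forms (t2ADCri / t2ADCrr / t2ADCrs) and the
// carry-setting 'S' variants (t2ADCSri / t2ADCSrr / t2ADCSrs), chosen by
// the CarryDefIsUnused / CarryDefIsUsed predicates.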
-/// T2I_rsc_is - Same as T2I_adde_sube_irs except the order of operands are
-/// reversed. It doesn't define the 'rr' form since it's handled by its
-/// T2I_adde_sube_irs counterpart.
-let Defs = [CPSR], Uses = [CPSR] in {
-multiclass T2I_rsc_is<string opc, PatFrag opnode> {
- // shifted imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs),
- opc, " $dst, $rhs, $lhs",
- [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>,
- Requires<[IsThumb2, CarryDefIsUnused]>;
- // shifted register
- def rs : T2sI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs),
- opc, " $dst, $rhs, $lhs",
- [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>,
- Requires<[IsThumb2, CarryDefIsUnused]>;
- // shifted imm
- def Sri : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs),
- !strconcat(opc, "s $dst, $rhs, $lhs"),
- [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>,
- Requires<[IsThumb2, CarryDefIsUsed]> {
- let Defs = [CPSR];
- }
- // shifted register
- def Srs : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs),
- !strconcat(opc, "s $dst, $rhs, $lhs"),
- [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>,
- Requires<[IsThumb2, CarryDefIsUsed]> {
- let Defs = [CPSR];
- }
-}
-}
-
-/// T2I_rbin_s_is - Same as T2I_bin_s_irs except the order of operands are
-/// reversed. It doesn't define the 'rr' form since it's handled by its
-/// T2I_bin_s_irs counterpart.
+/// T2I_rbin_s_is - Same as T2I_rbin_is except it sets the 's' bit.
let Defs = [CPSR] in {
multiclass T2I_rbin_s_is<string opc, PatFrag opnode> {
// shifted imm
def ri : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs, cc_out:$s),
- !strconcat(opc, "${s} $dst, $rhs, $lhs"),
+ IIC_iALUi,
+ !strconcat(opc, "${s}.w $dst, $rhs, $lhs"),
[(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>;
// shifted register
def rs : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs, cc_out:$s),
+ IIC_iALUsi,
!strconcat(opc, "${s} $dst, $rhs, $lhs"),
[(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>;
}
@@ -365,96 +301,96 @@ multiclass T2I_rbin_s_is<string opc, PatFrag opnode> {
// rotate operation that produces a value.
multiclass T2I_sh_ir<string opc, PatFrag opnode> {
// 5-bit imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, imm1_31:$rhs))]>;
// register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iMOVsr,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>;
}
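// For example, defm t2LSR : T2I_sh_ir<"lsr", ...> below expands to t2LSRri
// (shift by a 5-bit immediate) and t2LSRrr (shift by a register).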
-/// T21_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// T2I_cmp_is - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
/// patterns. Similar to T2I_bin_irs except the instruction does not produce
/// an explicit result; it only implicitly sets CPSR.
-let Uses = [CPSR] in {
+let Defs = [CPSR] in {
multiclass T2I_cmp_is<string opc, PatFrag opnode> {
// shifted imm
- def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs),
- opc, " $lhs, $rhs",
+ def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iCMPi,
+ opc, ".w $lhs, $rhs",
[(opnode GPR:$lhs, t2_so_imm:$rhs)]>;
// register
- def rr : T2I<(outs), (ins GPR:$lhs, GPR:$rhs),
- opc, " $lhs, $rhs",
+ def rr : T2I<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
+ opc, ".w $lhs, $rhs",
[(opnode GPR:$lhs, GPR:$rhs)]>;
// shifted register
- def rs : T2I<(outs), (ins GPR:$lhs, t2_so_reg:$rhs),
- opc, " $lhs, $rhs",
+ def rs : T2I<(outs), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iCMPsi,
+ opc, ".w $lhs, $rhs",
[(opnode GPR:$lhs, t2_so_reg:$rhs)]>;
}
}
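// For example, defm t2CMP : T2I_cmp_is<"cmp", ...> below creates t2CMPri,
// t2CMPrr and t2CMPrs; none of them produce a GPR result, they only set
// CPSR, hence the enclosing 'let Defs = [CPSR]'.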
/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
multiclass T2I_ld<string opc, PatFrag opnode> {
- def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr),
- opc, " $dst, $addr",
+ def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr), IIC_iLoadi,
+ opc, ".w $dst, $addr",
[(set GPR:$dst, (opnode t2addrmode_imm12:$addr))]>;
- def i8 : T2Ii8 <(outs GPR:$dst), (ins t2addrmode_imm8:$addr),
+ def i8 : T2Ii8 <(outs GPR:$dst), (ins t2addrmode_imm8:$addr), IIC_iLoadi,
opc, " $dst, $addr",
[(set GPR:$dst, (opnode t2addrmode_imm8:$addr))]>;
- def s : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr),
- opc, " $dst, $addr",
+ def s : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr), IIC_iLoadr,
+ opc, ".w $dst, $addr",
[(set GPR:$dst, (opnode t2addrmode_so_reg:$addr))]>;
- def pci : T2Ipc <(outs GPR:$dst), (ins i32imm:$addr),
- opc, " $dst, $addr",
+ def pci : T2Ipc <(outs GPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
+ opc, ".w $dst, $addr",
[(set GPR:$dst, (opnode (ARMWrapper tconstpool:$addr)))]>;
}
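// For example, defm t2LDRB : T2I_ld<"ldrb", UnOpFrag<(zextloadi8 node:$Src)>>
// below expands to t2LDRBi12, t2LDRBi8, t2LDRBs and t2LDRBpci, one per
// supported addressing mode.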
/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
multiclass T2I_st<string opc, PatFrag opnode> {
- def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr),
- opc, " $src, $addr",
+ def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr), IIC_iStorei,
+ opc, ".w $src, $addr",
[(opnode GPR:$src, t2addrmode_imm12:$addr)]>;
- def i8 : T2Ii8 <(outs), (ins GPR:$src, t2addrmode_imm8:$addr),
+ def i8 : T2Ii8 <(outs), (ins GPR:$src, t2addrmode_imm8:$addr), IIC_iStorei,
opc, " $src, $addr",
[(opnode GPR:$src, t2addrmode_imm8:$addr)]>;
- def s : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr),
- opc, " $src, $addr",
+ def s : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr), IIC_iStorer,
+ opc, ".w $src, $addr",
[(opnode GPR:$src, t2addrmode_so_reg:$addr)]>;
}
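// For example, defm t2STRH : T2I_st<"strh", ...> below expands to
// t2STRHi12, t2STRHi8 and t2STRHs; unlike loads, stores get no pc-relative
// form.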
/// T2I_picld - Defines the PIC load pattern.
class T2I_picld<string opc, PatFrag opnode> :
- T2I<(outs GPR:$dst), (ins addrmodepc:$addr),
- !strconcat("${addr:label}:\n\t", opc), " $dst, $addr",
+ T2I<(outs GPR:$dst), (ins addrmodepc:$addr), IIC_iLoadi,
+ !strconcat("\n${addr:label}:\n\t", opc), " $dst, $addr",
[(set GPR:$dst, (opnode addrmodepc:$addr))]>;
/// T2I_picst - Defines the PIC store pattern.
class T2I_picst<string opc, PatFrag opnode> :
- T2I<(outs), (ins GPR:$src, addrmodepc:$addr),
- !strconcat("${addr:label}:\n\t", opc), " $src, $addr",
+ T2I<(outs), (ins GPR:$src, addrmodepc:$addr), IIC_iStorer,
+ !strconcat("\n${addr:label}:\n\t", opc), " $src, $addr",
[(opnode GPR:$src, addrmodepc:$addr)]>;
/// T2I_unary_rrot - A unary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
multiclass T2I_unary_rrot<string opc, PatFrag opnode> {
- def r : T2I<(outs GPR:$dst), (ins GPR:$Src),
- opc, " $dst, $Src",
- [(set GPR:$dst, (opnode GPR:$Src))]>;
- def r_rot : T2I<(outs GPR:$dst), (ins GPR:$Src, i32imm:$rot),
- opc, " $dst, $Src, ror $rot",
- [(set GPR:$dst, (opnode (rotr GPR:$Src, rot_imm:$rot)))]>;
+ def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+ opc, ".w $dst, $src",
+ [(set GPR:$dst, (opnode GPR:$src))]>;
+ def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi,
+ opc, ".w $dst, $src, ror $rot",
+ [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>;
}
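// Hypothetical instantiation for illustration (the actual defm lives in the
// extend section):
//   defm t2SXTB : T2I_unary_rrot<"sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
// would give both "sxtb.w $dst, $src" and "sxtb.w $dst, $src, ror $rot".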
/// T2I_bin_rrot - A binary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
multiclass T2I_bin_rrot<string opc, PatFrag opnode> {
- def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS),
+ def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), IIC_iALUr,
opc, " $dst, $LHS, $RHS",
[(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>;
def rr_rot : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot),
- opc, " $dst, $LHS, $RHS, ror $rot",
+ IIC_iALUsr, opc, " $dst, $LHS, $RHS, ror $rot",
[(set GPR:$dst, (opnode GPR:$LHS,
(rotr GPR:$RHS, rot_imm:$rot)))]>;
}
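// Hypothetical instantiation for illustration:
//   defm t2SXTAB : T2I_bin_rrot<"sxtab",
//                    BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
// would give "sxtab $dst, $LHS, $RHS" plus the rotated-operand variant.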
@@ -467,42 +403,46 @@ multiclass T2I_bin_rrot<string opc, PatFrag opnode> {
// Miscellaneous Instructions.
//
-let isNotDuplicable = 1 in
-def t2PICADD : T2XI<(outs tGPR:$dst), (ins tGPR:$lhs, pclabel:$cp),
- "$cp:\n\tadd $dst, pc",
- [(set tGPR:$dst, (ARMpic_add tGPR:$lhs, imm:$cp))]>;
-
-
// LEApcrel - Load a pc-relative address into a register without offending the
// assembler.
-def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p),
- !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
- "${:private}PCRELL${:uid}+8))\n"),
- !strconcat("${:private}PCRELL${:uid}:\n\t",
- "add$p $dst, pc, #PCRELV${:uid}")),
- []>;
+def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi,
+ "adr$p.w $dst, #$label", []>;
def t2LEApcrelJT : T2XI<(outs GPR:$dst),
- (ins i32imm:$label, i32imm:$id, pred:$p),
- !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
- "${:private}PCRELL${:uid}+8))\n"),
- !strconcat("${:private}PCRELL${:uid}:\n\t",
- "add$p $dst, pc, #PCRELV${:uid}")),
- []>;
-
-// ADD rd, sp, #so_imm
-def t2ADDrSPi : T2XI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
- "add $dst, $sp, $imm",
- []>;
-
-// ADD rd, sp, #imm12
-def t2ADDrSPi12 : T2XI<(outs GPR:$dst), (ins GPR:$sp, i32imm:$imm),
- "addw $dst, $sp, $imm",
- []>;
-
-def t2ADDrSPs : T2XI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
- "addw $dst, $sp, $rhs",
- []>;
+ (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi,
+ "adr$p.w $dst, #${label}_${id}", []>;
+
+// ADD r, sp, {so_imm|i12}
+def t2ADDrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
+ IIC_iALUi, "add", ".w $dst, $sp, $imm", []>;
+def t2ADDrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
+ IIC_iALUi, "addw", " $dst, $sp, $imm", []>;
+
+// ADD r, sp, so_reg
+def t2ADDrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
+ IIC_iALUsi, "add", ".w $dst, $sp, $rhs", []>;
+
+// SUB r, sp, {so_imm|i12}
+def t2SUBrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
+ IIC_iALUi, "sub", ".w $dst, $sp, $imm", []>;
+def t2SUBrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
+ IIC_iALUi, "subw", " $dst, $sp, $imm", []>;
+
+// SUB r, sp, so_reg
+def t2SUBrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
+ IIC_iALUsi,
+ "sub", " $dst, $sp, $rhs", []>;
+
+
+// Pseudo instruction that will expand into a t2SUBrSPi + a copy.
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+def t2SUBrSPi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
+ NoItinerary, "@ sub.w $dst, $sp, $imm", []>;
+def t2SUBrSPi12_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
+ NoItinerary, "@ subw $dst, $sp, $imm", []>;
+def t2SUBrSPs_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
+ NoItinerary, "@ sub $dst, $sp, $rhs", []>;
+} // usesCustomDAGSchedInserter
//===----------------------------------------------------------------------===//
@@ -521,12 +461,14 @@ defm t2LDRB : T2I_ld<"ldrb", UnOpFrag<(zextloadi8 node:$Src)>>;
defm t2LDRSH : T2I_ld<"ldrsh", UnOpFrag<(sextloadi16 node:$Src)>>;
defm t2LDRSB : T2I_ld<"ldrsb", UnOpFrag<(sextloadi8 node:$Src)>>;
-let mayLoad = 1 in {
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
-def t2LDRDi8 : T2Ii8s4<(outs GPR:$dst), (ins t2addrmode_imm8s4:$addr),
- "ldrd", " $dst, $addr", []>;
-def t2LDRDpci : T2Ii8s4<(outs GPR:$dst), (ins i32imm:$addr),
- "ldrd", " $dst, $addr", []>;
+def t2LDRDi8 : T2Ii8s4<(outs GPR:$dst1, GPR:$dst2),
+ (ins t2addrmode_imm8s4:$addr),
+ IIC_iLoadi, "ldrd", " $dst1, $addr", []>;
+def t2LDRDpci : T2Ii8s4<(outs GPR:$dst1, GPR:$dst2),
+ (ins i32imm:$addr), IIC_iLoadi,
+ "ldrd", " $dst1, $addr", []>;
}
// zextload i1 -> zextload i8
@@ -573,57 +515,57 @@ def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
let mayLoad = 1 in {
def t2LDR_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins t2addrmode_imm8:$addr),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
"ldr", " $dst, $addr!", "$addr.base = $base_wb",
[]>;
def t2LDR_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
"ldr", " $dst, [$base], $offset", "$base = $base_wb",
[]>;
def t2LDRB_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins t2addrmode_imm8:$addr),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
"ldrb", " $dst, $addr!", "$addr.base = $base_wb",
[]>;
def t2LDRB_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
"ldrb", " $dst, [$base], $offset", "$base = $base_wb",
[]>;
def t2LDRH_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins t2addrmode_imm8:$addr),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
"ldrh", " $dst, $addr!", "$addr.base = $base_wb",
[]>;
def t2LDRH_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
"ldrh", " $dst, [$base], $offset", "$base = $base_wb",
[]>;
def t2LDRSB_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins t2addrmode_imm8:$addr),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
"ldrsb", " $dst, $addr!", "$addr.base = $base_wb",
[]>;
def t2LDRSB_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
"ldrsb", " $dst, [$base], $offset", "$base = $base_wb",
[]>;
def t2LDRSH_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins t2addrmode_imm8:$addr),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
"ldrsh", " $dst, $addr!", "$addr.base = $base_wb",
[]>;
def t2LDRSH_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
"ldrsh", " $dst, [$base], $offset", "$base = $base_wb",
[]>;
}
@@ -634,108 +576,95 @@ defm t2STRB : T2I_st<"strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
defm t2STRH : T2I_st<"strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
// Store doubleword
-let mayLoad = 1 in
-def t2STRDi8 : T2Ii8s4<(outs), (ins GPR:$src, t2addrmode_imm8s4:$addr),
- "strd", " $src, $addr", []>;
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def t2STRDi8 : T2Ii8s4<(outs),
+ (ins GPR:$src1, GPR:$src2, t2addrmode_imm8s4:$addr),
+ IIC_iStorer, "strd", " $src1, $addr", []>;
// Indexed stores
def t2STR_PRE : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
"str", " $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb,
(pre_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
def t2STR_POST : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
"str", " $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb,
(post_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
def t2STRH_PRE : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
"strh", " $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb,
(pre_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
def t2STRH_POST : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
"strh", " $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb,
(post_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
def t2STRB_PRE : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
"strb", " $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb,
(pre_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
def t2STRB_POST : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
"strb", " $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb,
(post_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
-// Address computation and loads and stores in PIC mode.
-let isNotDuplicable = 1, AddedComplexity = 10 in {
-let canFoldAsLoad = 1 in
-def t2PICLDR : T2I_picld<"ldr", UnOpFrag<(load node:$Src)>>;
-
-def t2PICLDRH : T2I_picld<"ldrh", UnOpFrag<(zextloadi16 node:$Src)>>;
-def t2PICLDRB : T2I_picld<"ldrb", UnOpFrag<(zextloadi8 node:$Src)>>;
-def t2PICLDRSH : T2I_picld<"ldrsh", UnOpFrag<(sextloadi16 node:$Src)>>;
-def t2PICLDRSB : T2I_picld<"ldrsb", UnOpFrag<(sextloadi8 node:$Src)>>;
-
-def t2PICSTR : T2I_picst<"str", BinOpFrag<(store node:$LHS, node:$RHS)>>;
-def t2PICSTRH : T2I_picst<"strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
-def t2PICSTRB : T2I_picst<"strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
-} // isNotDuplicable = 1, AddedComplexity = 10
-
+// FIXME: ldrd / strd pre / post variants
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
-let mayLoad = 1 in
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
def t2LDM : T2XI<(outs),
- (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops),
- "ldm${p}${addr:submode} $addr, $dst1", []>;
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ IIC_iLoadm, "ldm${addr:submode}${p}${addr:wide} $addr, $wb", []>;
-let mayStore = 1 in
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
def t2STM : T2XI<(outs),
- (ins addrmode4:$addr, pred:$p, reglist:$src1, variable_ops),
- "stm${p}${addr:submode} $addr, $src1", []>;
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ IIC_iStorem, "stm${addr:submode}${p}${addr:wide} $addr, $wb", []>;
//===----------------------------------------------------------------------===//
// Move Instructions.
//
let neverHasSideEffects = 1 in
-def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src),
- "mov", " $dst, $src", []>;
+def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
+ "mov", ".w $dst, $src", []>;
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def t2MOVi : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src),
- "mov", " $dst, $src",
+// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, AddedComplexity = 1 in
+def t2MOVi : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi,
+ "mov", ".w $dst, $src",
[(set GPR:$dst, t2_so_imm:$src)]>;
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def t2MOVi16 : T2I<(outs GPR:$dst), (ins i32imm:$src),
+def t2MOVi16 : T2I<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi,
"movw", " $dst, $src",
[(set GPR:$dst, imm0_65535:$src)]>;
-// FIXME: Also available in ARM mode.
let Constraints = "$src = $dst" in
-def t2MOVTi16 : T2sI<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
- "movt", " $dst, $imm",
- [(set GPR:$dst,
- (or (and GPR:$src, 0xffff), t2_lo16AllZero:$imm))]>;
+def t2MOVTi16 : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm), IIC_iMOVi,
+ "movt", " $dst, $imm",
+ [(set GPR:$dst,
+ (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>;
//===----------------------------------------------------------------------===//
// Extend Instructions.
@@ -785,12 +714,14 @@ defm t2SUBS : T2I_bin_s_irs <"sub", BinOpFrag<(subc node:$LHS, node:$RHS)>>;
defm t2ADC : T2I_adde_sube_irs<"adc",BinOpFrag<(adde node:$LHS, node:$RHS)>,1>;
defm t2SBC : T2I_adde_sube_irs<"sbc",BinOpFrag<(sube node:$LHS, node:$RHS)>>;
-// RSB, RSC
+// RSB
defm t2RSB : T2I_rbin_is <"rsb", BinOpFrag<(sub node:$LHS, node:$RHS)>>;
defm t2RSBS : T2I_rbin_s_is <"rsb", BinOpFrag<(subc node:$LHS, node:$RHS)>>;
-defm t2RSC : T2I_rsc_is <"rsc", BinOpFrag<(sube node:$LHS, node:$RHS)>>;
// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
+let AddedComplexity = 1 in
+def : T2Pat<(add GPR:$src, imm0_255_neg:$imm),
+ (t2SUBri GPR:$src, imm0_255_neg:$imm)>;
def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm),
(t2SUBri GPR:$src, t2_so_imm_neg:$imm)>;
def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm),
@@ -806,105 +737,250 @@ defm t2LSR : T2I_sh_ir<"lsr", BinOpFrag<(srl node:$LHS, node:$RHS)>>;
defm t2ASR : T2I_sh_ir<"asr", BinOpFrag<(sra node:$LHS, node:$RHS)>>;
defm t2ROR : T2I_sh_ir<"ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
-def t2MOVrx : T2sI<(outs GPR:$dst), (ins GPR:$src),
- "mov", " $dst, $src, rrx",
+let Uses = [CPSR] in {
+def t2MOVrx : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+ "rrx", " $dst, $src",
[(set GPR:$dst, (ARMrrx GPR:$src))]>;
+}
+
+let Defs = [CPSR] in {
+def t2MOVsrl_flag : T2XI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+ "lsrs.w $dst, $src, #1",
+ [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>;
+def t2MOVsra_flag : T2XI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+ "asrs.w $dst, $src, #1",
+ [(set GPR:$dst, (ARMsra_flag GPR:$src))]>;
+}
//===----------------------------------------------------------------------===//
// Bitwise Instructions.
//
-defm t2AND : T2I_bin_irs<"and", BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
-defm t2ORR : T2I_bin_irs<"orr", BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
-defm t2EOR : T2I_bin_irs<"eor", BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+defm t2AND : T2I_bin_w_irs<"and", BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+defm t2ORR : T2I_bin_w_irs<"orr", BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
+defm t2EOR : T2I_bin_w_irs<"eor", BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
-defm t2BIC : T2I_bin_irs<"bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+defm t2BIC : T2I_bin_w_irs<"bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
-def : T2Pat<(and GPR:$src, t2_so_imm_not:$imm),
- (t2BICri GPR:$src, t2_so_imm_not:$imm)>;
+let Constraints = "$src = $dst" in
+def t2BFC : T2I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
+ IIC_iALUi, "bfc", " $dst, $imm",
+ [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>;
-defm t2ORN : T2I_bin_irs<"orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
+def t2SBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+ IIC_iALUi, "sbfx", " $dst, $src, $lsb, $width", []>;
-def : T2Pat<(or GPR:$src, t2_so_imm_not:$imm),
- (t2ORNri GPR:$src, t2_so_imm_not:$imm)>;
+def t2UBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+ IIC_iALUi, "ubfx", " $dst, $src, $lsb, $width", []>;
+
+// FIXME: A8.6.18 BFI - Bitfield insert (Encoding T1)
+
+defm t2ORN : T2I_bin_irs<"orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
// Prefer this over t2EORri ra, rb, -1 because mvn has a 16-bit version
let AddedComplexity = 1 in
defm t2MVN : T2I_un_irs <"mvn", UnOpFrag<(not node:$Src)>, 1, 1>;
-def : T2Pat<(t2_so_imm_not:$src),
- (t2MVNi t2_so_imm_not:$src)>;
-// A8.6.17 BFC - Bitfield clear
-// FIXME: Also available in ARM mode.
-let Constraints = "$src = $dst" in
-def t2BFC : T2I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
- "bfc", " $dst, $imm",
- [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>;
+def : T2Pat<(and GPR:$src, t2_so_imm_not:$imm),
+ (t2BICri GPR:$src, t2_so_imm_not:$imm)>;
-// FIXME: A8.6.18 BFI - Bitfield insert (Encoding T1)
+// FIXME: Disable this pattern on Darwin to work around an assembler bug.
+def : T2Pat<(or GPR:$src, t2_so_imm_not:$imm),
+ (t2ORNri GPR:$src, t2_so_imm_not:$imm)>,
+ Requires<[IsThumb2]>;
+
+def : T2Pat<(t2_so_imm_not:$src),
+ (t2MVNi t2_so_imm_not:$src)>;
//===----------------------------------------------------------------------===//
// Multiply Instructions.
//
let isCommutable = 1 in
-def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b),
+def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
"mul", " $dst, $a, $b",
[(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;
-def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
"mla", " $dst, $a, $b, $c",
[(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>;
-def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
"mls", " $dst, $a, $b, $c",
[(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>;
-// FIXME: SMULL, etc.
+// Extra precision multiplies with low / high results
+let neverHasSideEffects = 1 in {
+let isCommutable = 1 in {
+def t2SMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64,
+ "smull", " $ldst, $hdst, $a, $b", []>;
+
+def t2UMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64,
+ "umull", " $ldst, $hdst, $a, $b", []>;
+}
+
+// Multiply + accumulate
+def t2SMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64,
+ "smlal", " $ldst, $hdst, $a, $b", []>;
+
+def t2UMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64,
+ "umlal", " $ldst, $hdst, $a, $b", []>;
+
+def t2UMAAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64,
+ "umaal", " $ldst, $hdst, $a, $b", []>;
+} // neverHasSideEffects
+
+// Most significant word multiply
+def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+ "smmul", " $dst, $a, $b",
+ [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>;
+
+def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
+ "smmla", " $dst, $a, $b, $c",
+ [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>;
+
+
+def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
+ "smmls", " $dst, $a, $b, $c",
+ [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>;
+
+multiclass T2I_smul<string opc, PatFrag opnode> {
+ def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+ !strconcat(opc, "bb"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+ (sext_inreg GPR:$b, i16)))]>;
+
+ def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+ !strconcat(opc, "bt"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+ (sra GPR:$b, (i32 16))))]>;
+
+ def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+ !strconcat(opc, "tb"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
+ (sext_inreg GPR:$b, i16)))]>;
+
+ def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+ !strconcat(opc, "tt"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
+ (sra GPR:$b, (i32 16))))]>;
+
+ def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16,
+ !strconcat(opc, "wb"), " $dst, $a, $b",
+ [(set GPR:$dst, (sra (opnode GPR:$a,
+ (sext_inreg GPR:$b, i16)), (i32 16)))]>;
+
+ def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16,
+ !strconcat(opc, "wt"), " $dst, $a, $b",
+ [(set GPR:$dst, (sra (opnode GPR:$a,
+ (sra GPR:$b, (i32 16))), (i32 16)))]>;
+}
+
+
+multiclass T2I_smla<string opc, PatFrag opnode> {
+ def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc,
+ (opnode (sext_inreg GPR:$a, i16),
+ (sext_inreg GPR:$b, i16))))]>;
+
+ def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
+ (sra GPR:$b, (i32 16)))))]>;
+
+ def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+ (sext_inreg GPR:$b, i16))))]>;
+
+ def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+ (sra GPR:$b, (i32 16)))))]>;
+
+ def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+ (sext_inreg GPR:$b, i16)), (i32 16))))]>;
+
+ def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+ (sra GPR:$b, (i32 16))), (i32 16))))]>;
+}
+
+defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
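+// For example, t2SMULBB (from T2I_smul above) multiplies the sign-extended
+// bottom halfwords of its operands:
+//   smulbb r0, r1, r2      @ r0 = (int16_t)r1 * (int16_t)r2
+// and t2SMLABB additionally adds the accumulator operand:
+//   smlabb r0, r1, r2, r3  @ r0 = (int16_t)r1 * (int16_t)r2 + r3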
+
+// TODO: Halfword multiply accumulate long: SMLAL<x><y>
+// TODO: Dual halfword multiply: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+
//===----------------------------------------------------------------------===//
// Misc. Arithmetic Instructions.
//
-def t2CLZ : T2I<(outs GPR:$dst), (ins GPR:$src),
+def t2CLZ : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
"clz", " $dst, $src",
[(set GPR:$dst, (ctlz GPR:$src))]>;
-def t2REV : T2I<(outs GPR:$dst), (ins GPR:$src),
- "rev", " $dst, $src",
+def t2REV : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+ "rev", ".w $dst, $src",
[(set GPR:$dst, (bswap GPR:$src))]>;
-def t2REV16 : T2I<(outs GPR:$dst), (ins GPR:$src),
- "rev16", " $dst, $src",
+def t2REV16 : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+ "rev16", ".w $dst, $src",
[(set GPR:$dst,
(or (and (srl GPR:$src, (i32 8)), 0xFF),
(or (and (shl GPR:$src, (i32 8)), 0xFF00),
(or (and (srl GPR:$src, (i32 8)), 0xFF0000),
(and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>;
-/////
-/// A8.6.137 REVSH
-/////
-def t2REVSH : T2I<(outs GPR:$dst), (ins GPR:$src),
- "revsh", " $dst, $src",
+def t2REVSH : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+ "revsh", ".w $dst, $src",
[(set GPR:$dst,
(sext_inreg
- (or (srl (and GPR:$src, 0xFFFF), (i32 8)),
+ (or (srl (and GPR:$src, 0xFF00), (i32 8)),
(shl GPR:$src, (i32 8))), i16))]>;
-// FIXME: PKHxx etc.
+def t2PKHBT : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt),
+ IIC_iALUsi, "pkhbt", " $dst, $src1, $src2, LSL $shamt",
+ [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF),
+ (and (shl GPR:$src2, (i32 imm:$shamt)),
+ 0xFFFF0000)))]>;
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : T2Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)),
+ (t2PKHBT GPR:$src1, GPR:$src2, 0)>;
+def : T2Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)),
+ (t2PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>;
+
+def t2PKHTB : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt),
+ IIC_iALUsi, "pkhtb", " $dst, $src1, $src2, ASR $shamt",
+ [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000),
+ (and (sra GPR:$src2, imm16_31:$shamt),
+ 0xFFFF)))]>;
+
+// Alternate cases for PKHTB where identities eliminate some nodes. Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+def : T2Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))),
+ (t2PKHTB GPR:$src1, GPR:$src2, 16)>;
+def : T2Pat<(or (and GPR:$src1, 0xFFFF0000),
+ (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)),
+ (t2PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>;
//===----------------------------------------------------------------------===//
// Comparison Instructions...
//
-defm t2CMP : T2I_cmp_is<"cmp",
- BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+defm t2CMP : T2I_cmp_is<"cmp",
+ BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
defm t2CMPz : T2I_cmp_is<"cmp",
BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>;
-defm t2CMN : T2I_cmp_is<"cmn",
- BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
+defm t2CMN : T2I_cmp_is<"cmn",
+ BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
defm t2CMNz : T2I_cmp_is<"cmn",
BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>;
@@ -923,45 +999,132 @@ defm t2TEQ : T2I_cmp_is<"teq",
// Short range conditional branch. Looks useful for loops, but we still need
// to figure out how to make use of it.
-// FIXME: Conditional moves
+
+// Conditional moves
+// FIXME: should be able to write a pattern for ARMcmov, but can't use
+// a two-value operand where a dag node expects two operands. :(
+def t2MOVCCr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true), IIC_iCMOVr,
+ "mov", ".w $dst, $true",
+ [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>,
+ RegConstraint<"$false = $dst">;
+
+def t2MOVCCi : T2I<(outs GPR:$dst), (ins GPR:$false, t2_so_imm:$true),
+ IIC_iCMOVi, "mov", ".w $dst, $true",
+[/*(set GPR:$dst, (ARMcmov GPR:$false, t2_so_imm:$true, imm:$cc, CCR:$ccr))*/]>,
+ RegConstraint<"$false = $dst">;
+
+def t2MOVCClsl : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs),
+ IIC_iCMOVsi, "lsl", ".w $dst, $true, $rhs", []>,
+ RegConstraint<"$false = $dst">;
+def t2MOVCClsr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs),
+ IIC_iCMOVsi, "lsr", ".w $dst, $true, $rhs", []>,
+ RegConstraint<"$false = $dst">;
+def t2MOVCCasr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs),
+ IIC_iCMOVsi, "asr", ".w $dst, $true, $rhs", []>,
+ RegConstraint<"$false = $dst">;
+def t2MOVCCror : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs),
+ IIC_iCMOVsi, "ror", ".w $dst, $true, $rhs", []>,
+ RegConstraint<"$false = $dst">;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+let isCall = 1,
+ Defs = [R0, R12, LR, CPSR] in {
+ def t2TPsoft : T2XI<(outs), (ins), IIC_Br,
+ "bl __aeabi_read_tp",
+ [(set R0, ARMthread_pointer)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+// eh_sjlj_setjmp() is an instruction sequence to store the return
+// address and save #0 in R0 for the non-longjmp case.
+// Since by its nature we may be coming from some other function to get
+// here, and we're using the stack frame for the containing function to
+// save/restore registers, we can't keep anything live in regs across
+// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+// when we get here from a longjmp(). We force everything out of registers
+// except for our own input by listing the relevant registers in Defs. By
+// doing so, we also cause the prologue/epilogue code to actively preserve
+// all of the callee-saved registers, which is exactly what we want.
+let Defs =
+ [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0,
+ D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
+ D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
+ D31 ] in {
+ def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins GPR:$src),
+ AddrModeNone, SizeSpecial, NoItinerary,
+ "str.w sp, [$src, #+8] @ eh_setjmp begin\n"
+ "\tadr r12, 0f\n"
+ "\torr r12, #1\n"
+ "\tstr.w r12, [$src, #+4]\n"
+ "\tmovs r0, #0\n"
+ "\tb 1f\n"
+ "0:\tmovs r0, #1 @ eh_setjmp end\n"
+ "1:", "",
+ [(set R0, (ARMeh_sjlj_setjmp GPR:$src))]>;
+}
+
+
//===----------------------------------------------------------------------===//
// Control-Flow Instructions
//
+// FIXME: remove when we have a way to mark an MI with these properties.
+// FIXME: $dst1 should be a def. But the extra ops must be in the end of the
+// operand list.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+ hasExtraDefRegAllocReq = 1 in
+ def t2LDM_RET : T2XI<(outs),
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ IIC_Br, "ldm${addr:submode}${p}${addr:wide} $addr, $wb",
+ []>;
+
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isPredicable = 1 in
-def t2B : T2XI<(outs), (ins brtarget:$target),
- "b $target",
+def t2B : T2XI<(outs), (ins brtarget:$target), IIC_Br,
+ "b.w $target",
[(br bb:$target)]>;
let isNotDuplicable = 1, isIndirectBranch = 1 in {
-def t2BR_JTr : T2JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id),
- "mov pc, $target \n$jt",
- [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>;
+def t2BR_JT :
+ T2JTI<(outs),
+ (ins GPR:$target, GPR:$index, jt2block_operand:$jt, i32imm:$id),
+ IIC_Br, "mov pc, $target\n$jt",
+ [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>;
-def t2BR_JTm :
+// FIXME: Add a non-pc based case that can be predicated.
+def t2TBB :
T2JTI<(outs),
- (ins t2addrmode_so_reg:$target, jtblock_operand:$jt, i32imm:$id),
- "ldr pc, $target \n$jt",
- [(ARMbrjt (i32 (load t2addrmode_so_reg:$target)), tjumptable:$jt,
- imm:$id)]>;
+ (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id),
+ IIC_Br, "tbb $index\n$jt", []>;
-def t2BR_JTadd :
+def t2TBH :
T2JTI<(outs),
- (ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id),
- "add pc, $target, $idx \n$jt",
- [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt, imm:$id)]>;
-} // isNotDuplicate, isIndirectBranch
+ (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id),
+ IIC_Br, "tbh $index\n$jt", []>;
+} // isNotDuplicable, isIndirectBranch
+
} // isBranch, isTerminator, isBarrier
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
let isBranch = 1, isTerminator = 1 in
-def t2Bcc : T2I<(outs), (ins brtarget:$target),
- "b", " $target",
+def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
+ "b", ".w $target",
[/*(ARMbrcond bb:$target, imm:$cc)*/]>;
+
+// IT block
+def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
+ AddrModeNone, Size2Bytes, IIC_iALUx,
+ "it$mask $cc", "", []>;
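+// For example, predicating three instructions, the first two on EQ and the
+// third on NE, is a single "itte eq" followed by the three conditionally
+// executed instructions.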
+
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//
@@ -972,7 +1135,10 @@ def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
(t2LEApcrelJT tjumptable:$dst, imm:$id)>;
-// Large immediate handling.
-
-def : T2Pat<(i32 imm:$src),
- (t2MOVTi16 (t2MOVi16 (t2_lo16 imm:$src)), (t2_hi16 imm:$src))>;
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction to make it re-materializable. Remove
+// when we can do generalized remat.
+let isReMaterializable = 1 in
+def t2MOVi32imm : T2Ix2<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi,
+ "movw", " $dst, ${src:lo16}\n\tmovt${p} $dst, ${src:hi16}",
+ [(set GPR:$dst, (i32 imm:$src))]>;
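+// For example, materializing 0x12345678 through this pseudo prints as
+//   movw r0, #0x5678     @ writes the low half, clears the upper 16 bits
+//   movt r0, #0x1234     @ writes the high half, preserves the low 16 bits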
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 9104c77..56336d1 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -36,57 +36,57 @@ def arm_fmdrr : SDNode<"ARMISD::FMDRR", SDT_FMDRR>;
let canFoldAsLoad = 1 in {
def FLDD : ADI5<0b1101, 0b01, (outs DPR:$dst), (ins addrmode5:$addr),
- "fldd", " $dst, $addr",
+ IIC_fpLoad64, "fldd", " $dst, $addr",
[(set DPR:$dst, (load addrmode5:$addr))]>;
def FLDS : ASI5<0b1101, 0b01, (outs SPR:$dst), (ins addrmode5:$addr),
- "flds", " $dst, $addr",
+ IIC_fpLoad32, "flds", " $dst, $addr",
[(set SPR:$dst, (load addrmode5:$addr))]>;
} // canFoldAsLoad
def FSTD : ADI5<0b1101, 0b00, (outs), (ins DPR:$src, addrmode5:$addr),
- "fstd", " $src, $addr",
+ IIC_fpStore64, "fstd", " $src, $addr",
[(store DPR:$src, addrmode5:$addr)]>;
def FSTS : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr),
- "fsts", " $src, $addr",
+ IIC_fpStore32, "fsts", " $src, $addr",
[(store SPR:$src, addrmode5:$addr)]>;
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
-let mayLoad = 1 in {
-def FLDMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dst1,
- variable_ops),
- "fldm${addr:submode}d${p} ${addr:base}, $dst1",
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
+def FLDMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+ variable_ops), IIC_fpLoadm,
+ "fldm${addr:submode}d${p} ${addr:base}, $wb",
[]> {
let Inst{20} = 1;
}
-def FLDMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dst1,
- variable_ops),
- "fldm${addr:submode}s${p} ${addr:base}, $dst1",
+def FLDMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+ variable_ops), IIC_fpLoadm,
+ "fldm${addr:submode}s${p} ${addr:base}, $wb",
[]> {
let Inst{20} = 1;
}
-}
+} // mayLoad, hasExtraDefRegAllocReq
-let mayStore = 1 in {
-def FSTMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1,
- variable_ops),
- "fstm${addr:submode}d${p} ${addr:base}, $src1",
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in {
+def FSTMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+ variable_ops), IIC_fpStorem,
+ "fstm${addr:submode}d${p} ${addr:base}, $wb",
[]> {
let Inst{20} = 0;
}
-def FSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1,
- variable_ops),
- "fstm${addr:submode}s${p} ${addr:base}, $src1",
+def FSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+ variable_ops), IIC_fpStorem,
+ "fstm${addr:submode}s${p} ${addr:base}, $wb",
[]> {
let Inst{20} = 0;
}
-} // mayStore
+} // mayStore, hasExtraSrcRegAllocReq
// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores
@@ -95,46 +95,48 @@ def FSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1,
//
def FADDD : ADbI<0b11100011, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
- "faddd", " $dst, $a, $b",
+ IIC_fpALU64, "faddd", " $dst, $a, $b",
[(set DPR:$dst, (fadd DPR:$a, DPR:$b))]>;
-def FADDS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
- "fadds", " $dst, $a, $b",
- [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>;
+def FADDS : ASbIn<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ IIC_fpALU32, "fadds", " $dst, $a, $b",
+ [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>;
// These are encoded as unary instructions.
+let Defs = [FPSCR] in {
def FCMPED : ADuI<0b11101011, 0b0100, 0b1100, (outs), (ins DPR:$a, DPR:$b),
- "fcmped", " $a, $b",
+ IIC_fpCMP64, "fcmped", " $a, $b",
[(arm_cmpfp DPR:$a, DPR:$b)]>;
def FCMPES : ASuI<0b11101011, 0b0100, 0b1100, (outs), (ins SPR:$a, SPR:$b),
- "fcmpes", " $a, $b",
+ IIC_fpCMP32, "fcmpes", " $a, $b",
[(arm_cmpfp SPR:$a, SPR:$b)]>;
+}
def FDIVD : ADbI<0b11101000, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
- "fdivd", " $dst, $a, $b",
+ IIC_fpDIV64, "fdivd", " $dst, $a, $b",
[(set DPR:$dst, (fdiv DPR:$a, DPR:$b))]>;
def FDIVS : ASbI<0b11101000, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
- "fdivs", " $dst, $a, $b",
+ IIC_fpDIV32, "fdivs", " $dst, $a, $b",
[(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>;
def FMULD : ADbI<0b11100010, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
- "fmuld", " $dst, $a, $b",
+ IIC_fpMUL64, "fmuld", " $dst, $a, $b",
[(set DPR:$dst, (fmul DPR:$a, DPR:$b))]>;
-def FMULS : ASbI<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
- "fmuls", " $dst, $a, $b",
- [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>;
+def FMULS : ASbIn<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ IIC_fpMUL32, "fmuls", " $dst, $a, $b",
+ [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>;
def FNMULD : ADbI<0b11100010, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
- "fnmuld", " $dst, $a, $b",
+ IIC_fpMUL64, "fnmuld", " $dst, $a, $b",
[(set DPR:$dst, (fneg (fmul DPR:$a, DPR:$b)))]> {
let Inst{6} = 1;
}
def FNMULS : ASbI<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
- "fnmuls", " $dst, $a, $b",
+ IIC_fpMUL32, "fnmuls", " $dst, $a, $b",
[(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]> {
let Inst{6} = 1;
}
@@ -147,14 +149,14 @@ def : Pat<(fmul (fneg SPR:$a), SPR:$b),
def FSUBD : ADbI<0b11100011, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
- "fsubd", " $dst, $a, $b",
+ IIC_fpALU64, "fsubd", " $dst, $a, $b",
[(set DPR:$dst, (fsub DPR:$a, DPR:$b))]> {
let Inst{6} = 1;
}
-def FSUBS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
- "fsubs", " $dst, $a, $b",
- [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]> {
+def FSUBS : ASbIn<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ IIC_fpALU32, "fsubs", " $dst, $a, $b",
+ [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]> {
let Inst{6} = 1;
}
@@ -163,29 +165,31 @@ def FSUBS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
//
def FABSD : ADuI<0b11101011, 0b0000, 0b1100, (outs DPR:$dst), (ins DPR:$a),
- "fabsd", " $dst, $a",
+ IIC_fpUNA64, "fabsd", " $dst, $a",
[(set DPR:$dst, (fabs DPR:$a))]>;
-def FABSS : ASuI<0b11101011, 0b0000, 0b1100, (outs SPR:$dst), (ins SPR:$a),
- "fabss", " $dst, $a",
- [(set SPR:$dst, (fabs SPR:$a))]>;
+def FABSS : ASuIn<0b11101011, 0b0000, 0b1100, (outs SPR:$dst), (ins SPR:$a),
+ IIC_fpUNA32, "fabss", " $dst, $a",
+ [(set SPR:$dst, (fabs SPR:$a))]>;
+let Defs = [FPSCR] in {
def FCMPEZD : ADuI<0b11101011, 0b0101, 0b1100, (outs), (ins DPR:$a),
- "fcmpezd", " $a",
+ IIC_fpCMP64, "fcmpezd", " $a",
[(arm_cmpfp0 DPR:$a)]>;
def FCMPEZS : ASuI<0b11101011, 0b0101, 0b1100, (outs), (ins SPR:$a),
- "fcmpezs", " $a",
+ IIC_fpCMP32, "fcmpezs", " $a",
[(arm_cmpfp0 SPR:$a)]>;
+}
def FCVTDS : ASuI<0b11101011, 0b0111, 0b1100, (outs DPR:$dst), (ins SPR:$a),
- "fcvtds", " $dst, $a",
+ IIC_fpCVTDS, "fcvtds", " $dst, $a",
[(set DPR:$dst, (fextend SPR:$a))]>;
// Special case encoding: bits 11-8 is 0b1011.
-def FCVTSD : AI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm,
- "fcvtsd", " $dst, $a",
- [(set SPR:$dst, (fround DPR:$a))]> {
+def FCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm,
+ IIC_fpCVTSD, "fcvtsd", " $dst, $a",
+ [(set SPR:$dst, (fround DPR:$a))]> {
let Inst{27-23} = 0b11101;
let Inst{21-16} = 0b110111;
let Inst{11-8} = 0b1011;
@@ -194,26 +198,26 @@ def FCVTSD : AI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm,
let neverHasSideEffects = 1 in {
def FCPYD : ADuI<0b11101011, 0b0000, 0b0100, (outs DPR:$dst), (ins DPR:$a),
- "fcpyd", " $dst, $a", []>;
+ IIC_fpUNA64, "fcpyd", " $dst, $a", []>;
def FCPYS : ASuI<0b11101011, 0b0000, 0b0100, (outs SPR:$dst), (ins SPR:$a),
- "fcpys", " $dst, $a", []>;
+ IIC_fpUNA32, "fcpys", " $dst, $a", []>;
} // neverHasSideEffects
def FNEGD : ADuI<0b11101011, 0b0001, 0b0100, (outs DPR:$dst), (ins DPR:$a),
- "fnegd", " $dst, $a",
+ IIC_fpUNA64, "fnegd", " $dst, $a",
[(set DPR:$dst, (fneg DPR:$a))]>;
-def FNEGS : ASuI<0b11101011, 0b0001, 0b0100, (outs SPR:$dst), (ins SPR:$a),
- "fnegs", " $dst, $a",
- [(set SPR:$dst, (fneg SPR:$a))]>;
+def FNEGS : ASuIn<0b11101011, 0b0001, 0b0100, (outs SPR:$dst), (ins SPR:$a),
+ IIC_fpUNA32, "fnegs", " $dst, $a",
+ [(set SPR:$dst, (fneg SPR:$a))]>;
def FSQRTD : ADuI<0b11101011, 0b0001, 0b1100, (outs DPR:$dst), (ins DPR:$a),
- "fsqrtd", " $dst, $a",
+ IIC_fpSQRT64, "fsqrtd", " $dst, $a",
[(set DPR:$dst, (fsqrt DPR:$a))]>;
def FSQRTS : ASuI<0b11101011, 0b0001, 0b1100, (outs SPR:$dst), (ins SPR:$a),
- "fsqrts", " $dst, $a",
+ IIC_fpSQRT32, "fsqrts", " $dst, $a",
[(set SPR:$dst, (fsqrt SPR:$a))]>;
//===----------------------------------------------------------------------===//
@@ -221,16 +225,16 @@ def FSQRTS : ASuI<0b11101011, 0b0001, 0b1100, (outs SPR:$dst), (ins SPR:$a),
//
def FMRS : AVConv2I<0b11100001, 0b1010, (outs GPR:$dst), (ins SPR:$src),
- "fmrs", " $dst, $src",
+ IIC_VMOVSI, "fmrs", " $dst, $src",
[(set GPR:$dst, (bitconvert SPR:$src))]>;
def FMSR : AVConv4I<0b11100000, 0b1010, (outs SPR:$dst), (ins GPR:$src),
- "fmsr", " $dst, $src",
+ IIC_VMOVIS, "fmsr", " $dst, $src",
[(set SPR:$dst, (bitconvert GPR:$src))]>;
def FMRRD : AVConv3I<0b11000101, 0b1011,
- (outs GPR:$dst1, GPR:$dst2), (ins DPR:$src),
- "fmrrd", " $dst1, $dst2, $src",
+                      (outs GPR:$dst1, GPR:$dst2), (ins DPR:$src),
+                      IIC_VMOVDI, "fmrrd", " $dst1, $dst2, $src",
[/* FIXME: Can't write pattern for multiple result instr*/]>;
// FMDHR: GPR -> SPR
@@ -238,7 +242,7 @@ def FMRRD : AVConv3I<0b11000101, 0b1011,
def FMDRR : AVConv5I<0b11000100, 0b1011,
(outs DPR:$dst), (ins GPR:$src1, GPR:$src2),
- "fmdrr", " $dst, $src1, $src2",
+ IIC_VMOVID, "fmdrr", " $dst, $src1, $src2",
[(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]>;
// FMRDH: SPR -> GPR
@@ -254,23 +258,23 @@ def FMDRR : AVConv5I<0b11000100, 0b1011,
// Int to FP:
def FSITOD : AVConv1I<0b11101011, 0b1000, 0b1011, (outs DPR:$dst), (ins SPR:$a),
- "fsitod", " $dst, $a",
+ IIC_fpCVTID, "fsitod", " $dst, $a",
[(set DPR:$dst, (arm_sitof SPR:$a))]> {
let Inst{7} = 1;
}
-def FSITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a),
- "fsitos", " $dst, $a",
+def FSITOS : AVConv1In<0b11101011, 0b1000, 0b1010, (outs SPR:$dst),(ins SPR:$a),
+ IIC_fpCVTIS, "fsitos", " $dst, $a",
[(set SPR:$dst, (arm_sitof SPR:$a))]> {
let Inst{7} = 1;
}
def FUITOD : AVConv1I<0b11101011, 0b1000, 0b1011, (outs DPR:$dst), (ins SPR:$a),
- "fuitod", " $dst, $a",
+ IIC_fpCVTID, "fuitod", " $dst, $a",
[(set DPR:$dst, (arm_uitof SPR:$a))]>;
-def FUITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a),
- "fuitos", " $dst, $a",
+def FUITOS : AVConv1In<0b11101011, 0b1000, 0b1010, (outs SPR:$dst),(ins SPR:$a),
+ IIC_fpCVTIS, "fuitos", " $dst, $a",
[(set SPR:$dst, (arm_uitof SPR:$a))]>;
// FP to Int:
@@ -278,28 +282,28 @@ def FUITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a),
def FTOSIZD : AVConv1I<0b11101011, 0b1101, 0b1011,
(outs SPR:$dst), (ins DPR:$a),
- "ftosizd", " $dst, $a",
+ IIC_fpCVTDI, "ftosizd", " $dst, $a",
[(set SPR:$dst, (arm_ftosi DPR:$a))]> {
let Inst{7} = 1; // Z bit
}
-def FTOSIZS : AVConv1I<0b11101011, 0b1101, 0b1010,
- (outs SPR:$dst), (ins SPR:$a),
- "ftosizs", " $dst, $a",
+def FTOSIZS : AVConv1In<0b11101011, 0b1101, 0b1010,
+ (outs SPR:$dst), (ins SPR:$a),
+ IIC_fpCVTSI, "ftosizs", " $dst, $a",
[(set SPR:$dst, (arm_ftosi SPR:$a))]> {
let Inst{7} = 1; // Z bit
}
def FTOUIZD : AVConv1I<0b11101011, 0b1100, 0b1011,
(outs SPR:$dst), (ins DPR:$a),
- "ftouizd", " $dst, $a",
+ IIC_fpCVTDI, "ftouizd", " $dst, $a",
[(set SPR:$dst, (arm_ftoui DPR:$a))]> {
let Inst{7} = 1; // Z bit
}
-def FTOUIZS : AVConv1I<0b11101011, 0b1100, 0b1010,
- (outs SPR:$dst), (ins SPR:$a),
- "ftouizs", " $dst, $a",
+def FTOUIZS : AVConv1In<0b11101011, 0b1100, 0b1010,
+ (outs SPR:$dst), (ins SPR:$a),
+ IIC_fpCVTSI, "ftouizs", " $dst, $a",
[(set SPR:$dst, (arm_ftoui SPR:$a))]> {
let Inst{7} = 1; // Z bit
}
@@ -309,48 +313,53 @@ def FTOUIZS : AVConv1I<0b11101011, 0b1100, 0b1010,
//
def FMACD : ADbI<0b11100000, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
- "fmacd", " $dst, $a, $b",
+ IIC_fpMAC64, "fmacd", " $dst, $a, $b",
[(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
-def FMACS : ASbI<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
- "fmacs", " $dst, $a, $b",
- [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
- RegConstraint<"$dstin = $dst">;
+def FMACS : ASbIn<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+ IIC_fpMAC32, "fmacs", " $dst, $a, $b",
+ [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
+ RegConstraint<"$dstin = $dst">;
def FMSCD : ADbI<0b11100001, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
- "fmscd", " $dst, $a, $b",
+ IIC_fpMAC64, "fmscd", " $dst, $a, $b",
[(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
- "fmscs", " $dst, $a, $b",
+ IIC_fpMAC32, "fmscs", " $dst, $a, $b",
[(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FNMACD : ADbI<0b11100000, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
- "fnmacd", " $dst, $a, $b",
+ IIC_fpMAC64, "fnmacd", " $dst, $a, $b",
[(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst"> {
let Inst{6} = 1;
}
-def FNMACS : ASbI<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
- "fnmacs", " $dst, $a, $b",
+def FNMACS : ASbIn<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+ IIC_fpMAC32, "fnmacs", " $dst, $a, $b",
[(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
RegConstraint<"$dstin = $dst"> {
let Inst{6} = 1;
}
+def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, DPR:$b)),
+ (FNMACD DPR:$dstin, DPR:$a, DPR:$b)>, Requires<[DontUseNEONForFP]>;
+def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)),
+ (FNMACS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>;
+
def FNMSCD : ADbI<0b11100001, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
- "fnmscd", " $dst, $a, $b",
+ IIC_fpMAC64, "fnmscd", " $dst, $a, $b",
[(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst"> {
let Inst{6} = 1;
}
def FNMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
- "fnmscs", " $dst, $a, $b",
+ IIC_fpMAC32, "fnmscs", " $dst, $a, $b",
[(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
RegConstraint<"$dstin = $dst"> {
let Inst{6} = 1;
@@ -362,25 +371,25 @@ def FNMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
def FCPYDcc : ADuI<0b11101011, 0b0000, 0b0100,
(outs DPR:$dst), (ins DPR:$false, DPR:$true),
- "fcpyd", " $dst, $true",
+ IIC_fpUNA64, "fcpyd", " $dst, $true",
[/*(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))*/]>,
RegConstraint<"$false = $dst">;
def FCPYScc : ASuI<0b11101011, 0b0000, 0b0100,
(outs SPR:$dst), (ins SPR:$false, SPR:$true),
- "fcpys", " $dst, $true",
+ IIC_fpUNA32, "fcpys", " $dst, $true",
[/*(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))*/]>,
RegConstraint<"$false = $dst">;
def FNEGDcc : ADuI<0b11101011, 0b0001, 0b0100,
(outs DPR:$dst), (ins DPR:$false, DPR:$true),
- "fnegd", " $dst, $true",
+ IIC_fpUNA64, "fnegd", " $dst, $true",
[/*(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))*/]>,
RegConstraint<"$false = $dst">;
def FNEGScc : ASuI<0b11101011, 0b0001, 0b0100,
(outs SPR:$dst), (ins SPR:$false, SPR:$true),
- "fnegs", " $dst, $true",
+ IIC_fpUNA32, "fnegs", " $dst, $true",
[/*(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))*/]>,
RegConstraint<"$false = $dst">;
@@ -389,8 +398,8 @@ def FNEGScc : ASuI<0b11101011, 0b0001, 0b0100,
// Misc.
//
-let Defs = [CPSR] in
-def FMSTAT : AI<(outs), (ins), VFPMiscFrm, "fmstat", "", [(arm_fmstat)]> {
+let Defs = [CPSR], Uses = [FPSCR] in
+def FMSTAT : VFPAI<(outs), (ins), VFPMiscFrm, IIC_fpSTAT, "fmstat", "", [(arm_fmstat)]> {
let Inst{27-20} = 0b11101111;
let Inst{19-16} = 0b0001;
let Inst{15-12} = 0b1111;
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
index e551c41..24990e6 100644
--- a/lib/Target/ARM/ARMJITInfo.cpp
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -19,15 +19,15 @@
#include "ARMSubtarget.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/Config/alloca.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Streams.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/System/Memory.h"
#include <cstdlib>
using namespace llvm;
void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
- abort();
+ llvm_report_error("ARMJITInfo::replaceMachineCodeForFunction");
}
/// JITCompilerFunction - This contains the address of the JIT function used to
@@ -45,11 +45,11 @@ static TargetJITInfo::JITCompilerFn JITCompilerFunction;
// CompilationCallback stub - We can't use a C function with inline assembly in
// it, because the prolog/epilog inserted by GCC won't work for us (we need
// to preserve more context and manipulate the stack directly). Instead,
-// write our own wrapper, which does things our way, so we have complete
+// write our own wrapper, which does things our way, so we have complete
// control over register saving and restoring.
extern "C" {
#if defined(__arm__)
- void ARMCompilationCallback(void);
+ void ARMCompilationCallback();
asm(
".text\n"
".align 2\n"
@@ -77,11 +77,11 @@ extern "C" {
// order for the registers.
// +--------+
// 0 | LR | Original return address
- // +--------+
+ // +--------+
// 1 | LR | Stub address (start of stub)
// 2-5 | R3..R0 | Saved registers (we need to preserve all regs)
// 6-20 | D0..D7 | Saved VFP registers
- // +--------+
+ // +--------+
//
#ifndef __SOFTFP__
// Restore VFP caller-saved registers.
@@ -103,15 +103,14 @@ extern "C" {
);
#else // Not an ARM host
void ARMCompilationCallback() {
- assert(0 && "Cannot call ARMCompilationCallback() on a non-ARM arch!\n");
- abort();
+ llvm_unreachable("Cannot call ARMCompilationCallback() on a non-ARM arch!");
}
#endif
}
-/// ARMCompilationCallbackC - This is the target-specific function invoked
-/// by the function stub when we did not know the real target of a call.
-/// This function must locate the start of the stub or call site and pass
+/// ARMCompilationCallbackC - This is the target-specific function invoked
+/// by the function stub when we did not know the real target of a call.
+/// This function must locate the start of the stub or call site and pass
/// it into the JIT compiler function.
extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) {
// Get the address of the compiled code for this function.
@@ -123,14 +122,12 @@ extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) {
// ldr pc, [pc,#-4]
// <addr>
if (!sys::Memory::setRangeWritable((void*)StubAddr, 8)) {
- cerr << "ERROR: Unable to mark stub writable\n";
- abort();
+ llvm_unreachable("ERROR: Unable to mark stub writable");
}
*(intptr_t *)StubAddr = 0xe51ff004; // ldr pc, [pc, #-4]
*(intptr_t *)(StubAddr+4) = NewVal;
if (!sys::Memory::setRangeExecutable((void*)StubAddr, 8)) {
- cerr << "ERROR: Unable to mark stub executable\n";
- abort();
+ llvm_unreachable("ERROR: Unable to mark stub executable");
}
}
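The two word-stores above turn the resolved stub into a literal-pool branch. On ARM, reading PC yields the address of the current instruction plus 8, so ldr pc, [pc, #-4] (the 0xe51ff004 word) loads the word placed immediately after itself, which is NewVal, the address of the compiled code. A standalone sketch of that address arithmetic (the names are illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical stub location; the pipeline makes PC read as the
      // instruction address plus 8, so [pc, #-4] resolves to the word
      // emitted directly after the ldr itself.
      uint32_t StubAddr = 0x8000;
      uint32_t PC = StubAddr + 8;
      assert(PC - 4 == StubAddr + 4); // the slot holding NewVal
      return 0;
    }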
@@ -143,7 +140,14 @@ ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) {
void *ARMJITInfo::emitGlobalValueIndirectSym(const GlobalValue *GV, void *Ptr,
JITCodeEmitter &JCE) {
JCE.startGVStub(GV, 4, 4);
+ intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ if (!sys::Memory::setRangeWritable((void*)Addr, 4)) {
+ llvm_unreachable("ERROR: Unable to mark indirect symbol writable");
+ }
JCE.emitWordLE((intptr_t)Ptr);
+ if (!sys::Memory::setRangeExecutable((void*)Addr, 4)) {
+ llvm_unreachable("ERROR: Unable to mark indirect symbol executable");
+ }
void *PtrAddr = JCE.finishGVStub(GV);
addIndirectSymAddr(Ptr, (intptr_t)PtrAddr);
return PtrAddr;
@@ -161,31 +165,43 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
if (!LazyPtr) {
// In PIC mode, the function stub is loading a lazy-ptr.
LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE);
- if (F)
- DOUT << "JIT: Indirect symbol emitted at [" << LazyPtr << "] for GV '"
- << F->getName() << "'\n";
- else
- DOUT << "JIT: Stub emitted at [" << LazyPtr
- << "] for external function at '" << Fn << "'\n";
+ DEBUG(if (F)
+ errs() << "JIT: Indirect symbol emitted at [" << LazyPtr
+ << "] for GV '" << F->getName() << "'\n";
+ else
+ errs() << "JIT: Stub emitted at [" << LazyPtr
+ << "] for external function at '" << Fn << "'\n");
}
JCE.startGVStub(F, 16, 4);
intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ if (!sys::Memory::setRangeWritable((void*)Addr, 16)) {
+ llvm_unreachable("ERROR: Unable to mark stub writable");
+ }
JCE.emitWordLE(0xe59fc004); // ldr ip, [pc, #+4]
JCE.emitWordLE(0xe08fc00c); // L_func$scv: add ip, pc, ip
JCE.emitWordLE(0xe59cf000); // ldr pc, [ip]
JCE.emitWordLE(LazyPtr - (Addr+4+8)); // func - (L_func$scv+8)
sys::Memory::InvalidateInstructionCache((void*)Addr, 16);
+ if (!sys::Memory::setRangeExecutable((void*)Addr, 16)) {
+ llvm_unreachable("ERROR: Unable to mark stub executable");
+ }
} else {
// The stub is 8-byte size and 4-aligned.
JCE.startGVStub(F, 8, 4);
intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ if (!sys::Memory::setRangeWritable((void*)Addr, 8)) {
+ llvm_unreachable("ERROR: Unable to mark stub writable");
+ }
JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4]
JCE.emitWordLE((intptr_t)Fn); // addr of function
sys::Memory::InvalidateInstructionCache((void*)Addr, 8);
+ if (!sys::Memory::setRangeExecutable((void*)Addr, 8)) {
+ llvm_unreachable("ERROR: Unable to mark stub executable");
+ }
}
} else {
// The compilation callback will overwrite the first two words of this
- // stub with indirect branch instructions targeting the compiled code.
+ // stub with indirect branch instructions targeting the compiled code.
// This stub sets the return address to restart the stub, so that
// the new branch will be invoked when we come back.
//
@@ -193,6 +209,9 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
// The stub is 16-byte size and 4-byte aligned.
JCE.startGVStub(F, 16, 4);
intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ if (!sys::Memory::setRangeWritable((void*)Addr, 16)) {
+ llvm_unreachable("ERROR: Unable to mark stub writable");
+ }
// Save LR so the callback can determine which stub called it.
// The compilation callback is responsible for popping this prior
// to returning.
@@ -204,6 +223,9 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
// The address of the compilation callback.
JCE.emitWordLE((intptr_t)ARMCompilationCallback);
sys::Memory::InvalidateInstructionCache((void*)Addr, 16);
+ if (!sys::Memory::setRangeExecutable((void*)Addr, 16)) {
+ llvm_unreachable("ERROR: Unable to mark stub executable");
+ }
}
return JCE.finishGVStub(F);
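Every stub-emission path in this patch now wraps its word writes in the same guard sequence: mark the range writable, emit, invalidate the instruction cache, then restore execute permission. A condensed sketch of that recurring pattern (emitGuarded is a hypothetical helper, not part of ARMJITInfo; the sys::Memory calls are the ones used above):

    // Minimal sketch, assuming the llvm::sys::Memory API shown above.
    template <typename EmitFn>
    static void emitGuarded(void *Addr, size_t Size, EmitFn EmitBody) {
      if (!sys::Memory::setRangeWritable(Addr, Size))
        llvm_unreachable("ERROR: Unable to mark stub writable");
      EmitBody(); // the JCE.emitWordLE calls go here
      sys::Memory::InvalidateInstructionCache(Addr, Size);
      if (!sys::Memory::setRangeExecutable(Addr, Size))
        llvm_unreachable("ERROR: Unable to mark stub executable");
    }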
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 59cf125..d2ec9ee 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -15,9 +15,11 @@
#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -29,6 +31,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -61,6 +64,7 @@ namespace {
const TargetRegisterInfo *TRI;
ARMFunctionInfo *AFI;
RegScavenger *RS;
+ bool isThumb2;
virtual bool runOnMachineFunction(MachineFunction &Fn);
@@ -93,6 +97,15 @@ namespace {
void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
+ bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const TargetInstrInfo *TII,
+ bool &Advance,
+ MachineBasicBlock::iterator &I);
+ bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ bool &Advance,
+ MachineBasicBlock::iterator &I);
bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
};
@@ -107,6 +120,14 @@ static int getLoadStoreMultipleOpcode(int Opcode) {
case ARM::STR:
NumSTMGened++;
return ARM::STM;
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ NumLDMGened++;
+ return ARM::t2LDM;
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ NumSTMGened++;
+ return ARM::t2STM;
case ARM::FLDS:
NumFLDMGened++;
return ARM::FLDMS;
@@ -119,14 +140,30 @@ static int getLoadStoreMultipleOpcode(int Opcode) {
case ARM::FSTD:
NumFSTMGened++;
return ARM::FSTMD;
- default: abort();
+ default: llvm_unreachable("Unhandled opcode!");
}
return 0;
}
+static bool isT2i32Load(unsigned Opc) {
+ return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
+}
+
+static bool isi32Load(unsigned Opc) {
+ return Opc == ARM::LDR || isT2i32Load(Opc);
+}
+
+static bool isT2i32Store(unsigned Opc) {
+ return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
+}
+
+static bool isi32Store(unsigned Opc) {
+ return Opc == ARM::STR || isT2i32Store(Opc);
+}
+
/// MergeOps - Create and insert an LDM or STM with Base as the base register and
/// registers in Regs as the register operands that would be loaded / stored.
-/// It returns true if the transformation is done.
+/// It returns true if the transformation is done.
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -140,14 +177,20 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
return false;
ARM_AM::AMSubMode Mode = ARM_AM::ia;
- bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
- if (isAM4 && Offset == 4)
+ bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
+ if (isAM4 && Offset == 4) {
+ if (isThumb2)
+ // Thumb2 does not support ldmib / stmib.
+ return false;
Mode = ARM_AM::ib;
- else if (isAM4 && Offset == -4 * (int)NumRegs + 4)
+ } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
+ if (isThumb2)
+ // Thumb2 does not support ldmda / stmda.
+ return false;
Mode = ARM_AM::da;
- else if (isAM4 && Offset == -4 * (int)NumRegs)
+ } else if (isAM4 && Offset == -4 * (int)NumRegs) {
Mode = ARM_AM::db;
- else if (Offset != 0) {
+ } else if (Offset != 0) {
// If the starting offset isn't zero, insert an MI to materialize a new base.
// But only do so if it is cost effective, i.e. merging more than two
// loads / stores.
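The submode chosen above encodes where the merged block sits relative to the base register: a run starting at [base, #4] maps to increment-before (ib), a run whose highest address is the base itself maps to decrement-after (da), one covering [base - 4*NumRegs, base) maps to decrement-before (db), and a run starting at the base stays increment-after (ia); Thumb2 only encodes ia and db, hence the two early returns. A standalone restatement of that decision (the enum names are illustrative):

    enum SubMode { ia, ib, da, db, none };

    // Mirrors the checks above for a word-sized chain of NumRegs accesses
    // whose lowest offset from the base register is Offset.
    SubMode pickSubMode(int Offset, int NumRegs, bool isThumb2) {
      if (Offset == 4)                return isThumb2 ? none : ib;
      if (Offset == -4 * NumRegs + 4) return isThumb2 ? none : da;
      if (Offset == -4 * NumRegs)     return db;
      if (Offset == 0)                return ia;
      return none; // nonzero start: materialize a new base first
    }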
@@ -155,7 +198,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
return false;
unsigned NewBase;
- if (Opcode == ARM::LDR)
+ if (isi32Load(Opcode))
// If it is a load, then just use one of the destination register to
// use as the new base.
NewBase = Regs[NumRegs-1].first;
@@ -165,24 +208,30 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
if (NewBase == 0)
return false;
}
- int BaseOpc = ARM::ADDri;
+ int BaseOpc = !isThumb2
+ ? ARM::ADDri
+ : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
if (Offset < 0) {
- BaseOpc = ARM::SUBri;
+ BaseOpc = !isThumb2
+ ? ARM::SUBri
+ : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
Offset = - Offset;
}
- int ImmedOffset = ARM_AM::getSOImmVal(Offset);
+ int ImmedOffset = isThumb2
+ ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
if (ImmedOffset == -1)
+ // FIXME: Try t2ADDri12 or t2SUBri12?
return false; // Probably not worth it then.
BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
- .addReg(Base, getKillRegState(BaseKill)).addImm(ImmedOffset)
+ .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
.addImm(Pred).addReg(PredReg).addReg(0);
Base = NewBase;
BaseKill = true; // New base is always killed right after its use.
}
bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD;
- bool isDef = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+ bool isDef = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
Opcode = getLoadStoreMultipleOpcode(Opcode);
MachineInstrBuilder MIB = (isAM4)
? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
@@ -192,6 +241,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
.addReg(Base, getKillRegState(BaseKill))
.addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
.addImm(Pred).addReg(PredReg);
+ MIB.addReg(0); // Add optional writeback (0 for now).
for (unsigned i = 0; i != NumRegs; ++i)
MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
| getKillRegState(Regs[i].second));
@@ -207,7 +257,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
ARMCC::CondCodes Pred, unsigned PredReg,
unsigned Scratch, MemOpQueue &MemOps,
SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
- bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
+ bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
int Offset = MemOps[SIndex].Offset;
int SOffset = Offset;
unsigned Pos = MemOps[SIndex].Position;
@@ -265,41 +315,53 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
return;
}
-/// getInstrPredicate - If instruction is predicated, returns its predicate
-/// condition, otherwise returns AL. It also returns the condition code
-/// register by reference.
-static ARMCC::CondCodes getInstrPredicate(MachineInstr *MI, unsigned &PredReg) {
- int PIdx = MI->findFirstPredOperandIdx();
- if (PIdx == -1) {
- PredReg = 0;
- return ARMCC::AL;
- }
-
- PredReg = MI->getOperand(PIdx+1).getReg();
- return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
-}
-
static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
- unsigned Bytes, ARMCC::CondCodes Pred,
- unsigned PredReg) {
+ unsigned Bytes, unsigned Limit,
+ ARMCC::CondCodes Pred, unsigned PredReg){
unsigned MyPredReg = 0;
- return (MI && MI->getOpcode() == ARM::SUBri &&
- MI->getOperand(0).getReg() == Base &&
+ if (!MI)
+ return false;
+ if (MI->getOpcode() != ARM::t2SUBri &&
+ MI->getOpcode() != ARM::t2SUBrSPi &&
+ MI->getOpcode() != ARM::t2SUBrSPi12 &&
+ MI->getOpcode() != ARM::tSUBspi &&
+ MI->getOpcode() != ARM::SUBri)
+ return false;
+
+ // Make sure the offset is positive and within the limit (0 means no limit).
+ if (Bytes <= 0 || (Limit && Bytes >= Limit))
+ return false;
+
+ unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
+ return (MI->getOperand(0).getReg() == Base &&
MI->getOperand(1).getReg() == Base &&
- ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
- getInstrPredicate(MI, MyPredReg) == Pred &&
+ (MI->getOperand(2).getImm()*Scale) == Bytes &&
+ llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
MyPredReg == PredReg);
}
static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
- unsigned Bytes, ARMCC::CondCodes Pred,
- unsigned PredReg) {
+ unsigned Bytes, unsigned Limit,
+ ARMCC::CondCodes Pred, unsigned PredReg){
unsigned MyPredReg = 0;
- return (MI && MI->getOpcode() == ARM::ADDri &&
- MI->getOperand(0).getReg() == Base &&
+ if (!MI)
+ return false;
+ if (MI->getOpcode() != ARM::t2ADDri &&
+ MI->getOpcode() != ARM::t2ADDrSPi &&
+ MI->getOpcode() != ARM::t2ADDrSPi12 &&
+ MI->getOpcode() != ARM::tADDspi &&
+ MI->getOpcode() != ARM::ADDri)
+ return false;
+
+ if (Bytes <= 0 || (Limit && Bytes >= Limit))
+ // Make sure the offset is positive and within the limit (0 means no limit).
+ return false;
+
+ unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
+ return (MI->getOperand(0).getReg() == Base &&
MI->getOperand(1).getReg() == Base &&
- ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
- getInstrPredicate(MI, MyPredReg) == Pred &&
+ (MI->getOperand(2).getImm()*Scale) == Bytes &&
+ llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
MyPredReg == PredReg);
}
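Both matchers now accept the ARM, Thumb2, and Thumb1 SP-relative add/sub forms, and the Scale factor papers over the encoding difference: tADDspi/tSUBspi store their immediate in words, so an operand of 3 means a 12-byte adjustment, while the other forms store raw bytes. A tiny standalone check of that scaling:

    #include <cassert>

    int main() {
      // tSUBspi-style operand: an immediate of 3 words equals 12 bytes,
      // the size of a three-register transfer.
      unsigned Imm = 3, Scale = 4, Bytes = 12;
      assert(Imm * Scale == Bytes);
      return 0;
    }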
@@ -308,6 +370,10 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
default: return 0;
case ARM::LDR:
case ARM::STR:
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
case ARM::FLDS:
case ARM::FSTS:
return 4;
@@ -316,7 +382,9 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
return 8;
case ARM::LDM:
case ARM::STM:
- return (MI->getNumOperands() - 4) * 4;
+ case ARM::t2LDM:
+ case ARM::t2STM:
+ return (MI->getNumOperands() - 5) * 4;
case ARM::FLDMS:
case ARM::FSTMS:
case ARM::FLDMD:
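The count drops from NumOperands - 4 to NumOperands - 5 because this patch gives every load/store-multiple a fifth fixed operand, the optional writeback register (the MIB.addReg(0) added in MergeOps), ahead of the register list. A worked instance:

    #include <cassert>

    int main() {
      // ldmia r0, {r4,r5,r6} now carries: base, mode immediate,
      // predicate, predicate register, writeback register, r4, r5, r6.
      unsigned NumOperands = 8;
      assert((NumOperands - 5) * 4 == 12); // three 4-byte words moved
      return 0;
    }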
@@ -325,7 +393,7 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
}
}
-/// mergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base
+/// MergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
/// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
@@ -337,17 +405,18 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
-static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- bool &Advance,
- MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ bool &Advance,
+ MachineBasicBlock::iterator &I) {
MachineInstr *MI = MBBI;
unsigned Base = MI->getOperand(0).getReg();
unsigned Bytes = getLSMultipleTransferSize(MI);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
int Opcode = MI->getOpcode();
- bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::STM;
+ bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
+ Opcode == ARM::STM || Opcode == ARM::t2STM;
if (isAM4) {
if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
@@ -364,13 +433,17 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
if (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
if (Mode == ARM_AM::ia &&
- isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
+ MI->getOperand(4).setReg(Base);
+ MI->getOperand(4).setIsDef();
MBB.erase(PrevMBBI);
return true;
} else if (Mode == ARM_AM::ib &&
- isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
+ MI->getOperand(4).setReg(Base); // WB to base
+ MI->getOperand(4).setIsDef();
MBB.erase(PrevMBBI);
return true;
}
@@ -379,8 +452,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
if (MBBI != MBB.end()) {
MachineBasicBlock::iterator NextMBBI = next(MBBI);
if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
- isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+ MI->getOperand(4).setReg(Base); // WB to base
+ MI->getOperand(4).setIsDef();
if (NextMBBI == I) {
Advance = true;
++I;
@@ -388,8 +463,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
MBB.erase(NextMBBI);
return true;
} else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
- isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+ MI->getOperand(4).setReg(Base); // WB to base
+ MI->getOperand(4).setIsDef();
if (NextMBBI == I) {
Advance = true;
++I;
@@ -408,8 +485,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
if (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
if (Mode == ARM_AM::ia &&
- isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
+ MI->getOperand(4).setReg(Base); // WB to base
+ MI->getOperand(4).setIsDef();
MBB.erase(PrevMBBI);
return true;
}
@@ -418,8 +497,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
if (MBBI != MBB.end()) {
MachineBasicBlock::iterator NextMBBI = next(MBBI);
if (Mode == ARM_AM::ia &&
- isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
+ MI->getOperand(4).setReg(Base); // WB to base
+ MI->getOperand(4).setIsDef();
if (NextMBBI == I) {
Advance = true;
++I;
@@ -441,7 +522,13 @@ static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
case ARM::FLDD: return ARM::FLDMD;
case ARM::FSTS: return ARM::FSTMS;
case ARM::FSTD: return ARM::FSTMD;
- default: abort();
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ return ARM::t2LDR_PRE;
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ return ARM::t2STR_PRE;
+ default: llvm_unreachable("Unhandled opcode!");
}
return 0;
}
@@ -454,48 +541,62 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
case ARM::FLDD: return ARM::FLDMD;
case ARM::FSTS: return ARM::FSTMS;
case ARM::FSTD: return ARM::FSTMD;
- default: abort();
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ return ARM::t2LDR_POST;
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ return ARM::t2STR_POST;
+ default: llvm_unreachable("Unhandled opcode!");
}
return 0;
}
-/// mergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base
+/// MergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
-static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const TargetInstrInfo *TII,
- bool &Advance,
- MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const TargetInstrInfo *TII,
+ bool &Advance,
+ MachineBasicBlock::iterator &I) {
MachineInstr *MI = MBBI;
unsigned Base = MI->getOperand(1).getReg();
bool BaseKill = MI->getOperand(1).isKill();
unsigned Bytes = getLSMultipleTransferSize(MI);
int Opcode = MI->getOpcode();
DebugLoc dl = MI->getDebugLoc();
+ bool isAM5 = Opcode == ARM::FLDD || Opcode == ARM::FLDS ||
+ Opcode == ARM::FSTD || Opcode == ARM::FSTS;
bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
- if ((isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0) ||
- (!isAM2 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0))
+ if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
return false;
+ else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
+ return false;
+ else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
+ if (MI->getOperand(2).getImm() != 0)
+ return false;
- bool isLd = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+ bool isLd = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
// Can't do the merge if the destination register is the same as the would-be
// writeback register.
if (isLd && MI->getOperand(0).getReg() == Base)
return false;
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
bool DoMerge = false;
ARM_AM::AddrOpc AddSub = ARM_AM::add;
unsigned NewOpc = 0;
+ // Offset limits: AM2 allows 12 bits, Thumb2 allows 8 bits.
+ unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
if (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
- if (isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
DoMerge = true;
AddSub = ARM_AM::sub;
NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
- } else if (isAM2 && isMatchingIncrement(PrevMBBI, Base, Bytes,
- Pred, PredReg)) {
+ } else if (!isAM5 &&
+ isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
DoMerge = true;
NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
}
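These two opcode tables feed MergeBaseUpdateLoadStore below: folding a base update that executes before the access yields the pre-indexed opcode (for example, add r1, r1, #4 followed by ldr r0, [r1] becomes ldr r0, [r1, #4]!), while folding one that executes after yields the post-indexed opcode (ldr r0, [r1] followed by add r1, r1, #4 becomes ldr r0, [r1], #4). A small standalone simulation of the two addressing behaviors:

    #include <cassert>

    int main() {
      int base = 100;
      // Pre-indexed: ldr r0, [r1, #4]! uses base+4 and writes it back.
      int preAddr = base + 4; base = preAddr;
      assert(preAddr == 104 && base == 104);
      // Post-indexed: ldr r0, [r1], #4 uses the old base, then updates.
      base = 100;
      int postAddr = base; base += 4;
      assert(postAddr == 100 && base == 104);
      return 0;
    }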
@@ -505,11 +606,12 @@ static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
if (!DoMerge && MBBI != MBB.end()) {
MachineBasicBlock::iterator NextMBBI = next(MBBI);
- if (isAM2 && isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ if (!isAM5 &&
+ isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
DoMerge = true;
AddSub = ARM_AM::sub;
NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
- } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
DoMerge = true;
NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
}
@@ -526,33 +628,51 @@ static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
return false;
bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD;
- unsigned Offset = isAM2 ? ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift)
- : ARM_AM::getAM5Opc((AddSub == ARM_AM::sub) ? ARM_AM::db : ARM_AM::ia,
- true, isDPR ? 2 : 1);
+ unsigned Offset = 0;
+ if (isAM5)
+ Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
+ ? ARM_AM::db
+ : ARM_AM::ia, true, (isDPR ? 2 : 1));
+ else if (isAM2)
+ Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ else
+ Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
if (isLd) {
- if (isAM2)
- // LDR_PRE, LDR_POST;
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
- .addReg(Base, RegState::Define)
- .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
- else
+ if (isAM5)
// FLDMS, FLDMD
BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
.addReg(Base, getKillRegState(BaseKill))
.addImm(Offset).addImm(Pred).addReg(PredReg)
+ .addReg(Base, getDefRegState(true)) // WB base register
.addReg(MI->getOperand(0).getReg(), RegState::Define);
- } else {
- MachineOperand &MO = MI->getOperand(0);
- if (isAM2)
- // STR_PRE, STR_POST;
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
- .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ else if (isAM2)
+ // LDR_PRE, LDR_POST,
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
.addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
else
+ // t2LDR_PRE, t2LDR_POST
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ } else {
+ MachineOperand &MO = MI->getOperand(0);
+ if (isAM5)
// FSTMS, FSTMD
BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
.addImm(Pred).addReg(PredReg)
+ .addReg(Base, getDefRegState(true)) // WB base register
.addReg(MO.getReg(), getKillRegState(MO.isKill()));
+ else if (isAM2)
+ // STR_PRE, STR_POST
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+ else
+ // t2STR_PRE, t2STR_POST
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
}
MBB.erase(MBBI);
@@ -561,7 +681,7 @@ static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
/// isMemoryOp - Returns true if instruction is a memory operation (that this
/// pass is capable of operating on).
-static bool isMemoryOp(MachineInstr *MI) {
+static bool isMemoryOp(const MachineInstr *MI) {
int Opcode = MI->getOpcode();
switch (Opcode) {
default: break;
@@ -574,6 +694,11 @@ static bool isMemoryOp(MachineInstr *MI) {
case ARM::FLDD:
case ARM::FSTD:
return MI->getOperand(1).isReg();
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ return MI->getOperand(1).isReg();
}
return false;
}
@@ -600,6 +725,12 @@ static int getMemoryOpOffset(const MachineInstr *MI) {
bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
unsigned NumOperands = MI->getDesc().getNumOperands();
unsigned OffField = MI->getOperand(NumOperands-3).getImm();
+
+ if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
+ Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
+ Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
+ return OffField;
+
int Offset = isAM2
? ARM_AM::getAM2Offset(OffField)
: (isAM3 ? ARM_AM::getAM3Offset(OffField)
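The early return works because the Thumb2 i8/i12 forms keep a plain signed byte offset in the operand, whereas the ARM AM2/AM3/AM5 forms pack a magnitude plus an add/sub flag into one immediate, which the code here unpacks and negates. A standalone sketch of that sign-and-magnitude round trip (the bit layout below is illustrative, not LLVM's exact AM2 encoding):

    #include <cassert>

    // Pack a signed offset as sub-flag | magnitude, then recover it the
    // way getMemoryOpOffset does for the non-Thumb2 addressing modes.
    unsigned pack(int Off) {
      return (Off < 0 ? 0x1000u : 0u) | (unsigned)(Off < 0 ? -Off : Off);
    }
    int unpack(unsigned Field) {
      int Mag = Field & 0xfff;
      return (Field & 0x1000u) ? -Mag : Mag;
    }

    int main() {
      assert(unpack(pack(-8)) == -8);
      assert(unpack(pack(12)) == 12);
      return 0;
    }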
@@ -621,37 +752,43 @@ static void InsertLDR_STR(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
int OffImm, bool isDef,
DebugLoc dl, unsigned NewOpc,
- unsigned Reg, bool RegDeadKill,
- unsigned BaseReg, bool BaseKill,
- unsigned OffReg, bool OffKill,
+ unsigned Reg, bool RegDeadKill, bool RegUndef,
+ unsigned BaseReg, bool BaseKill, bool BaseUndef,
+ unsigned OffReg, bool OffKill, bool OffUndef,
ARMCC::CondCodes Pred, unsigned PredReg,
- const TargetInstrInfo *TII) {
- unsigned Offset;
- if (OffImm < 0)
- Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
- else
- Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
- if (isDef)
- BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+ const TargetInstrInfo *TII, bool isT2) {
+ int Offset = OffImm;
+ if (!isT2) {
+ if (OffImm < 0)
+ Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
+ else
+ Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
+ }
+ if (isDef) {
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+ TII->get(NewOpc))
.addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
- .addReg(BaseReg, getKillRegState(BaseKill))
- .addReg(OffReg, getKillRegState(OffKill))
- .addImm(Offset)
- .addImm(Pred).addReg(PredReg);
- else
- BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
- .addReg(Reg, getKillRegState(RegDeadKill))
- .addReg(BaseReg, getKillRegState(BaseKill))
- .addReg(OffReg, getKillRegState(OffKill))
- .addImm(Offset)
- .addImm(Pred).addReg(PredReg);
+ .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+ if (!isT2)
+ MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
+ MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+ } else {
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+ TII->get(NewOpc))
+ .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
+ .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+ if (!isT2)
+ MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
+ MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+ }
}
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) {
MachineInstr *MI = &*MBBI;
unsigned Opcode = MI->getOpcode();
- if (Opcode == ARM::LDRD || Opcode == ARM::STRD) {
+ if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
+ Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
unsigned EvenReg = MI->getOperand(0).getReg();
unsigned OddReg = MI->getOperand(1).getReg();
unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
@@ -659,45 +796,59 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
return false;
- bool isLd = Opcode == ARM::LDRD;
+ bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
+ bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
bool EvenDeadKill = isLd ?
MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
+ bool EvenUndef = MI->getOperand(0).isUndef();
bool OddDeadKill = isLd ?
MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
+ bool OddUndef = MI->getOperand(1).isUndef();
const MachineOperand &BaseOp = MI->getOperand(2);
unsigned BaseReg = BaseOp.getReg();
bool BaseKill = BaseOp.isKill();
- const MachineOperand &OffOp = MI->getOperand(3);
- unsigned OffReg = OffOp.getReg();
- bool OffKill = OffOp.isKill();
+ bool BaseUndef = BaseOp.isUndef();
+ unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
+ bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
+ bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
int OffImm = getMemoryOpOffset(MI);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
// Ascending register numbers and no offset. It's safe to change it to a
// ldm or stm.
- unsigned NewOpc = (Opcode == ARM::LDRD) ? ARM::LDM : ARM::STM;
+ unsigned NewOpc = (isLd)
+ ? (isT2 ? ARM::t2LDM : ARM::LDM)
+ : (isT2 ? ARM::t2STM : ARM::STM);
if (isLd) {
BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
.addReg(BaseReg, getKillRegState(BaseKill))
.addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
.addImm(Pred).addReg(PredReg)
+ .addReg(0)
.addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
- .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
+ .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
++NumLDRD2LDM;
} else {
BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
.addReg(BaseReg, getKillRegState(BaseKill))
.addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
.addImm(Pred).addReg(PredReg)
- .addReg(EvenReg, getKillRegState(EvenDeadKill))
- .addReg(OddReg, getKillRegState(OddDeadKill));
+ .addReg(0)
+ .addReg(EvenReg,
+ getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
+ .addReg(OddReg,
+ getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
++NumSTRD2STM;
}
} else {
// Split into two instructions.
- unsigned NewOpc = (Opcode == ARM::LDRD) ? ARM::LDR : ARM::STR;
+ assert((!isT2 || !OffReg) &&
+ "Thumb2 ldrd / strd does not encode offset register!");
+ unsigned NewOpc = (isLd)
+ ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
+ : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
DebugLoc dl = MBBI->getDebugLoc();
// If this is a load and base register is killed, it may have been
// re-defed by the load, make sure the first load does not clobber it.
@@ -707,17 +858,23 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
(OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
assert(!TRI->regsOverlap(OddReg, BaseReg) &&
(!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
- InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, OddReg, OddDeadKill,
- BaseReg, false, OffReg, false, Pred, PredReg, TII);
- InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill,
- BaseReg, BaseKill, OffReg, OffKill, Pred, PredReg, TII);
+ InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
+ OddReg, OddDeadKill, false,
+ BaseReg, false, BaseUndef, OffReg, false, OffUndef,
+ Pred, PredReg, TII, isT2);
+ InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
+ EvenReg, EvenDeadKill, false,
+ BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
+ Pred, PredReg, TII, isT2);
} else {
InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
- EvenReg, EvenDeadKill, BaseReg, false, OffReg, false,
- Pred, PredReg, TII);
+ EvenReg, EvenDeadKill, EvenUndef,
+ BaseReg, false, BaseUndef, OffReg, false, OffUndef,
+ Pred, PredReg, TII, isT2);
InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
- OddReg, OddDeadKill, BaseReg, BaseKill, OffReg, OffKill,
- Pred, PredReg, TII);
+ OddReg, OddDeadKill, OddUndef,
+ BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
+ Pred, PredReg, TII, isT2);
}
if (isLd)
++NumLDRD2LDR;
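The whole rewrite exists because ARM ldrd/strd only accepts a register pair of the form <even, even+1> (r0/r1, r2/r3, and so on); when the allocator produced anything else, the instruction must become an ldm/stm (ascending registers, zero offset) or two separate accesses. The legality test, restated standalone:

    #include <cassert>

    // Mirrors the (EvenRegNum & 1) == 0 && EvenRegNum + 1 == OddRegNum
    // check above, using raw register numbers for illustration.
    bool isLegalPair(unsigned Even, unsigned Odd) {
      return (Even & 1) == 0 && Even + 1 == Odd;
    }

    int main() {
      assert(isLegalPair(0, 1));  // r0, r1: encodable as ldrd
      assert(!isLegalPair(1, 2)); // r1, r2: must be rewritten
      return 0;
    }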
@@ -761,7 +918,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
unsigned Size = getLSMultipleTransferSize(MBBI);
unsigned Base = MBBI->getOperand(1).getReg();
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
+ ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
int Offset = getMemoryOpOffset(MBBI);
// Watch out for:
// r4 := ldr [r5]
@@ -772,7 +929,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
// looks like the later ldr(s) use the same base register. Try to
// merge the ldr's so far, including this one. But don't try to
// combine the following ldr(s).
- Clobber = (Opcode == ARM::LDR && Base == MBBI->getOperand(0).getReg());
+ Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
if (CurrBase == 0 && !Clobber) {
// Start of a new chain.
CurrBase = Base;
@@ -825,12 +982,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
// Try to find a free register to use as a new base in case it's needed.
// First advance to the instruction just before the start of the chain.
AdvanceRS(MBB, MemOps);
- // Find a scratch register. Make sure it's a call clobbered register or
- // a spilled callee-saved register.
- unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass, true);
- if (!Scratch)
- Scratch = RS->FindUnusedReg(&ARM::GPRRegClass,
- AFI->getSpilledCSRegisters());
+ // Find a scratch register.
+ unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
// Process the load / store instructions.
RS->forward(prior(MBBI));
@@ -842,7 +995,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
// Try folding preceding/trailing base inc/dec into the generated
// LDM/STM ops.
for (unsigned i = 0, e = Merges.size(); i < e; ++i)
- if (mergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
+ if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
++NumMerges;
NumMerges += Merges.size();
@@ -850,15 +1003,15 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
// that were not merged to form LDM/STM ops.
for (unsigned i = 0; i != NumMemOps; ++i)
if (!MemOps[i].Merged)
- if (mergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
+ if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
++NumMerges;
- // RS may be pointing to an instruction that's deleted.
+ // RS may be pointing to an instruction that's deleted.
RS->skipTo(prior(MBBI));
} else if (NumMemOps == 1) {
// Try folding preceding/trailing base inc/dec into the single
// load/store.
- if (mergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
+ if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
++NumMerges;
RS->forward(prior(MBBI));
}
@@ -907,16 +1060,18 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
if (MBB.empty()) return false;
MachineBasicBlock::iterator MBBI = prior(MBB.end());
- if (MBBI->getOpcode() == ARM::BX_RET && MBBI != MBB.begin()) {
+ if (MBBI != MBB.begin() &&
+ (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
MachineInstr *PrevMI = prior(MBBI);
- if (PrevMI->getOpcode() == ARM::LDM) {
+ if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
- if (MO.getReg() == ARM::LR) {
- PrevMI->setDesc(TII->get(ARM::LDM_RET));
- MO.setReg(ARM::PC);
- MBB.erase(MBBI);
- return true;
- }
+ if (MO.getReg() != ARM::LR)
+ return false;
+ unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
+ PrevMI->setDesc(TII->get(NewOpc));
+ MO.setReg(ARM::PC);
+ MBB.erase(MBBI);
+ return true;
}
}
return false;
@@ -928,6 +1083,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = TM.getInstrInfo();
TRI = TM.getRegisterInfo();
RS = new RegScavenger();
+ isThumb2 = AFI->isThumb2Function();
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
@@ -956,6 +1112,7 @@ namespace {
const TargetRegisterInfo *TRI;
const ARMSubtarget *STI;
MachineRegisterInfo *MRI;
+ MachineFunction *MF;
virtual bool runOnMachineFunction(MachineFunction &Fn);
@@ -967,8 +1124,9 @@ namespace {
bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
unsigned &NewOpc, unsigned &EvenReg,
unsigned &OddReg, unsigned &BaseReg,
- unsigned &OffReg, unsigned &Offset,
- unsigned &PredReg, ARMCC::CondCodes &Pred);
+ unsigned &OffReg, int &Offset,
+ unsigned &PredReg, ARMCC::CondCodes &Pred,
+ bool &isT2);
bool RescheduleOps(MachineBasicBlock *MBB,
SmallVector<MachineInstr*, 4> &Ops,
unsigned Base, bool isLd,
@@ -984,6 +1142,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TRI = Fn.getTarget().getRegisterInfo();
STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
MRI = &Fn.getRegInfo();
+ MF = &Fn;
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
@@ -1045,48 +1204,83 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
DebugLoc &dl,
unsigned &NewOpc, unsigned &EvenReg,
unsigned &OddReg, unsigned &BaseReg,
- unsigned &OffReg, unsigned &Offset,
+ unsigned &OffReg, int &Offset,
unsigned &PredReg,
- ARMCC::CondCodes &Pred) {
+ ARMCC::CondCodes &Pred,
+ bool &isT2) {
+ // Make sure we're allowed to generate LDRD/STRD.
+ if (!STI->hasV5TEOps())
+ return false;
+
// FIXME: FLDS / FSTS -> FLDD / FSTD
+ unsigned Scale = 1;
unsigned Opcode = Op0->getOpcode();
if (Opcode == ARM::LDR)
NewOpc = ARM::LDRD;
else if (Opcode == ARM::STR)
NewOpc = ARM::STRD;
- else
- return 0;
+ else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
+ NewOpc = ARM::t2LDRDi8;
+ Scale = 4;
+ isT2 = true;
+ } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
+ NewOpc = ARM::t2STRDi8;
+ Scale = 4;
+ isT2 = true;
+ } else
+ return false;
+
+ // Make sure the offset registers match.
+ if (!isT2 &&
+ (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
+ return false;
// Make sure the base address satisfies the i64 ld / st alignment requirement.
if (!Op0->hasOneMemOperand() ||
- !Op0->memoperands_begin()->getValue() ||
- Op0->memoperands_begin()->isVolatile())
+ !(*Op0->memoperands_begin())->getValue() ||
+ (*Op0->memoperands_begin())->isVolatile())
return false;
- unsigned Align = Op0->memoperands_begin()->getAlignment();
+ unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+ Function *Func = MF->getFunction();
unsigned ReqAlign = STI->hasV6Ops()
- ? TD->getPrefTypeAlignment(Type::Int64Ty) : 8; // Pre-v6 need 8-byte align
+ ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
+ : 8; // Pre-v6 need 8-byte align
if (Align < ReqAlign)
return false;
// Then make sure the immediate offset fits.
int OffImm = getMemoryOpOffset(Op0);
- ARM_AM::AddrOpc AddSub = ARM_AM::add;
- if (OffImm < 0) {
- AddSub = ARM_AM::sub;
- OffImm = - OffImm;
+ if (isT2) {
+ if (OffImm < 0) {
+ if (OffImm < -255)
+ // Can't fall back to t2LDRi8 / t2STRi8.
+ return false;
+ } else {
+ int Limit = (1 << 8) * Scale;
+ if (OffImm >= Limit || (OffImm & (Scale-1)))
+ return false;
+ }
+ Offset = OffImm;
+ } else {
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (OffImm < 0) {
+ AddSub = ARM_AM::sub;
+ OffImm = - OffImm;
+ }
+ int Limit = (1 << 8) * Scale;
+ if (OffImm >= Limit || (OffImm & (Scale-1)))
+ return false;
+ Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
}
- if (OffImm >= 256) // 8 bits
- return false;
- Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
-
EvenReg = Op0->getOperand(0).getReg();
OddReg = Op1->getOperand(0).getReg();
if (EvenReg == OddReg)
return false;
BaseReg = Op0->getOperand(1).getReg();
- OffReg = Op0->getOperand(2).getReg();
- Pred = getInstrPredicate(Op0, PredReg);
+ if (!isT2)
+ OffReg = Op0->getOperand(2).getReg();
+ Pred = llvm::getInstrPredicate(Op0, PredReg);
dl = Op0->getDebugLoc();
return true;
}
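The offset arithmetic above bounds what the paired forms can encode: for Thumb2 (Scale 4) the immediate must be a multiple of 4 below (1 << 8) * 4 = 1024, with negatives limited to -255 so a failed pairing can still fall back to t2LDRi8/t2STRi8; for ARM (Scale 1) the AM3 magnitude must stay below 256. A worked check mirroring that logic:

    #include <cassert>

    // Positive-offset legality as coded above; Scale is 4 for the
    // Thumb2 D-form, 1 for the ARM AM3 form.
    bool fits(int OffImm, int Scale) {
      int Limit = (1 << 8) * Scale;
      return OffImm < Limit && (OffImm & (Scale - 1)) == 0;
    }

    int main() {
      assert(fits(1020, 4));  // Thumb2: multiple of 4, under 1024
      assert(!fits(1022, 4)); // Thumb2: not a multiple of 4
      assert(fits(252, 1));   // ARM: within the 8-bit magnitude
      assert(!fits(256, 1));  // ARM: one past the limit
      return 0;
    }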
@@ -1138,7 +1332,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
LastOffset = Offset;
LastBytes = Bytes;
LastOpcode = Opcode;
- if (++NumMove == 8) // FIXME: Tune
+ if (++NumMove == 8) // FIXME: Tune this limit.
break;
}
@@ -1174,29 +1368,36 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
unsigned EvenReg = 0, OddReg = 0;
unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
ARMCC::CondCodes Pred = ARMCC::AL;
+ bool isT2 = false;
unsigned NewOpc = 0;
- unsigned Offset = 0;
+ int Offset = 0;
DebugLoc dl;
if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
EvenReg, OddReg, BaseReg, OffReg,
- Offset, PredReg, Pred)) {
+ Offset, PredReg, Pred, isT2)) {
Ops.pop_back();
Ops.pop_back();
// Form the pair instruction.
if (isLd) {
- BuildMI(*MBB, InsertPos, dl, TII->get(NewOpc))
+ MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
+ dl, TII->get(NewOpc))
.addReg(EvenReg, RegState::Define)
.addReg(OddReg, RegState::Define)
- .addReg(BaseReg).addReg(0).addImm(Offset)
- .addImm(Pred).addReg(PredReg);
+ .addReg(BaseReg);
+ if (!isT2)
+ MIB.addReg(OffReg);
+ MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
++NumLDRDFormed;
} else {
- BuildMI(*MBB, InsertPos, dl, TII->get(NewOpc))
+ MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
+ dl, TII->get(NewOpc))
.addReg(EvenReg)
.addReg(OddReg)
- .addReg(BaseReg).addReg(0).addImm(Offset)
- .addImm(Pred).addReg(PredReg);
+ .addReg(BaseReg);
+ if (!isT2)
+ MIB.addReg(OffReg);
+ MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
++NumSTRDFormed;
}
MBB->erase(Op0);
@@ -1249,12 +1450,11 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
if (!isMemoryOp(MI))
continue;
unsigned PredReg = 0;
- if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
+ if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
continue;
- int Opcode = MI->getOpcode();
- bool isLd = Opcode == ARM::LDR ||
- Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+ int Opc = MI->getOpcode();
+ bool isLd = isi32Load(Opc) || Opc == ARM::FLDS || Opc == ARM::FLDD;
unsigned Base = MI->getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);
diff --git a/lib/Target/ARM/ARMMCAsmInfo.cpp b/lib/Target/ARM/ARMMCAsmInfo.cpp
new file mode 100644
index 0000000..0ff65d2
--- /dev/null
+++ b/lib/Target/ARM/ARMMCAsmInfo.cpp
@@ -0,0 +1,72 @@
+//===-- ARMMCAsmInfo.cpp - ARM asm properties -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARMMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCAsmInfo.h"
+using namespace llvm;
+
+static const char *const arm_asm_table[] = {
+ "{r0}", "r0",
+ "{r1}", "r1",
+ "{r2}", "r2",
+ "{r3}", "r3",
+ "{r4}", "r4",
+ "{r5}", "r5",
+ "{r6}", "r6",
+ "{r7}", "r7",
+ "{r8}", "r8",
+ "{r9}", "r9",
+ "{r10}", "r10",
+ "{r11}", "r11",
+ "{r12}", "r12",
+ "{r13}", "r13",
+ "{r14}", "r14",
+ "{lr}", "lr",
+ "{sp}", "sp",
+ "{ip}", "ip",
+ "{fp}", "fp",
+ "{sl}", "sl",
+ "{memory}", "memory",
+ "{cc}", "cc",
+ 0,0
+};
+
+ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() {
+ AsmTransCBE = arm_asm_table;
+ Data64bitsDirective = 0;
+ CommentString = "@";
+ COMMDirectiveTakesAlignment = false;
+ SupportsDebugInformation = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::SjLj;
+ AbsoluteEHSectionOffsets = false;
+}
+
+ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
+ AlignmentIsInBytes = false;
+ Data64bitsDirective = 0;
+ CommentString = "@";
+ COMMDirectiveTakesAlignment = false;
+
+ NeedsSet = false;
+ HasLEB128 = true;
+ AbsoluteDebugSectionOffsets = true;
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+ LCOMMDirective = "\t.lcomm\t";
+
+ DwarfRequiresFrameSection = false;
+
+ SupportsDebugInformation = true;
+}
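AsmTransCBE points at a flat, null-terminated array of {alias, canonical} string pairs; a printer walks it two entries at a time to rewrite GCC-style register names such as "{r0}" appearing in inline assembly. A minimal standalone walk of a table in this shape (translate is illustrative, not LLVM's API):

    #include <cstdio>
    #include <cstring>

    static const char *const table[] = {
      "{r0}", "r0",  "{lr}", "lr",  "{sp}", "sp",  0, 0
    };

    // Scan the pair list; fall through to the input if nothing matches.
    const char *translate(const char *Name) {
      for (unsigned i = 0; table[i]; i += 2)
        if (!std::strcmp(table[i], Name))
          return table[i + 1];
      return Name;
    }

    int main() {
      std::printf("%s\n", translate("{lr}")); // prints "lr"
      return 0;
    }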
diff --git a/lib/Target/ARM/ARMMCAsmInfo.h b/lib/Target/ARM/ARMMCAsmInfo.h
new file mode 100644
index 0000000..90f7822
--- /dev/null
+++ b/lib/Target/ARM/ARMMCAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- ARMMCAsmInfo.h - ARM asm properties -------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARMTARGETASMINFO_H
+#define LLVM_ARMTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+
+ struct ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
+ explicit ARMMCAsmInfoDarwin();
+ };
+
+ struct ARMELFMCAsmInfo : public MCAsmInfo {
+ explicit ARMELFMCAsmInfo();
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 66d3df6..2176b27 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -1,10 +1,10 @@
//====- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===//
-//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file declares ARM-specific per-machine-function information.
@@ -52,10 +52,6 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// enable far jump.
bool LRSpilledForFarJump;
- /// R3IsLiveIn - True if R3 is live in to this function.
- /// FIXME: Remove when register scavenger for Thumb is done.
- bool R3IsLiveIn;
-
/// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
/// spill stack offset.
unsigned FramePtrSpillOffset;
@@ -100,7 +96,7 @@ public:
hasThumb2(false),
Align(2U),
VarArgsRegSaveSize(0), HasStackFrame(false),
- LRSpilledForFarJump(false), R3IsLiveIn(false),
+ LRSpilledForFarJump(false),
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
@@ -111,7 +107,7 @@ public:
hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
Align(isThumb ? 1U : 2U),
VarArgsRegSaveSize(0), HasStackFrame(false),
- LRSpilledForFarJump(false), R3IsLiveIn(false),
+ LRSpilledForFarJump(false),
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
@@ -119,6 +115,7 @@ public:
JumpTableUId(0), ConstPoolEntryUId(0) {}
bool isThumbFunction() const { return isThumb; }
+ bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
bool isThumb2Function() const { return isThumb && hasThumb2; }
unsigned getAlign() const { return Align; }
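Together with isThumbFunction and isThumb2Function, the new predicate gives a clean three-way split that the rewritten load/store optimizer keys off: ARM, Thumb1-only (Thumb without the Thumb2 extension), and Thumb2. A trivial check of the partition:

    #include <cassert>

    int main() {
      // A Thumb function on a core without Thumb2 is Thumb1-only.
      bool isThumb = true, hasThumb2 = false;
      assert(isThumb && !hasThumb2);   // isThumb1OnlyFunction()
      assert(!(isThumb && hasThumb2)); // isThumb2Function()
      return 0;
    }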
@@ -133,13 +130,9 @@ public:
bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
- // FIXME: Remove when register scavenger for Thumb is done.
- bool isR3LiveIn() const { return R3IsLiveIn; }
- void setR3IsLiveIn(bool l) { R3IsLiveIn = l; }
-
unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
-
+
unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
unsigned getDPRCalleeSavedAreaOffset() const { return DPRCSOffset; }
diff --git a/lib/Target/ARM/ARMPerfectShuffle.h b/lib/Target/ARM/ARMPerfectShuffle.h
new file mode 100644
index 0000000..5ff7c38
--- /dev/null
+++ b/lib/Target/ARM/ARMPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using NEON instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
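6561 is 9 to the 4th power: each of the four result lanes picks one of the eight source elements or undef (the u in the comments), and entries are ordered lexicographically with lane 0 most significant, as the <0,0,0,0>, <0,0,0,1>, ..., <0,0,0,u> progression shows. A sketch of the presumed lookup, matching how LLVM's other perfect-shuffle tables are indexed (the names are illustrative):

    // Map a 4-lane mask (element 0..7, or -1 for undef) to its base-9
    // index into PerfectShuffleTable; undef occupies slot 8 of each digit.
    unsigned pfIndex(const int Mask[4]) {
      unsigned Idx = 0;
      for (int i = 0; i < 4; ++i)
        Idx = Idx * 9 + (Mask[i] < 0 ? 8u : (unsigned)Mask[i]);
      return Idx; // 0 .. 6560
    }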
+static const unsigned PerfectShuffleTable[6561+1] = {
+ 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+ 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+ 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+ 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+ 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+ 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+ 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+ 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+ 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+ 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+ 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+ 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+ 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+ 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+ 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+ 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+ 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+ 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+ 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+ 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+ 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+ 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+ 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+ 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+ 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+ 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+ 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+ 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+ 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+ 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+ 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+ 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+ 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+ 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+ 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+ 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+ 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+ 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+ 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+ 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+ 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+ 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+ 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+ 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+ 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+ 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+ 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+ 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+ 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+ 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+ 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+ 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+ 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+ 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+ 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+ 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+ 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+ 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+ 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+ 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+ 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+ 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+ 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+ 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+ 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+ 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+ 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+ 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+ 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+ 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+ 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+ 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+ 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+ 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+ 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+ 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+ 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+ 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+ 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+ 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+ 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+ 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+ 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+ 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+ 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+ 835584U, // <0,1,2,3>: Cost 0 copy LHS
+ 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+ 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+ 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+ 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+ 835584U, // <0,1,2,u>: Cost 0 copy LHS
+ 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+ 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+ 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+ 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+ 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+ 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+ 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+ 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+ 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+ 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+ 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+ 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+ 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+ 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+ 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+ 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+ 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+ 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+ 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+ 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+ 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+ 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+ 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+ 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+ 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+ 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+ 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+ 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+ 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+ 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+ 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+ 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+ 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+ 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+ 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+ 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+ 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+ 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+ 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+ 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+ 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+ 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+ 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+ 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+ 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+ 835584U, // <0,1,u,3>: Cost 0 copy LHS
+ 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+ 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+ 835584U, // <0,1,u,u>: Cost 0 copy LHS
+ 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+ 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+ 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+ 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+ 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+ 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+ 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+ 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+ 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+ 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+ 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+ 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+ 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+ 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+ 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+ 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+ 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+ 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+ 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+ 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+ 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+ 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+ 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+ 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+ 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+ 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+ 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+ 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+ 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+ 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+ 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+ 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+ 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+ 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+ 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+ 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+ 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+ 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+ 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+ 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+ 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+ 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+ 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+ 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+ 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+ 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+ 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+ 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+ 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+ 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+ 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+ 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+ 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+ 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+ 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+ 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+ 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+ 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+ 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+ 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+ 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+ 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+ 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+ 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+ 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+ 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+ 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+ 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+ 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+ 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+ 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+ 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+ 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+ 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+ 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+ 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+ 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+ 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+ 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+ 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+ 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+ 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+ 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+ 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+ 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+ 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+ 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+ 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+ 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+ 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+ 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+ 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+ 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+ 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+ 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+ 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+ 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+ 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+ 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+ 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+ 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+ 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+ 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+ 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+ 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+ 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+ 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+ 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+ 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+ 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+ 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+ 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+ 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+ 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+ 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+ 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+ 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+ 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+ 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+ 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+ 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+ 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+ 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+ 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+ 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+ 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+ 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+ 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+ 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+ 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+ 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+ 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+ 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+ 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+ 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+ 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+ 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+ 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+ 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+ 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+ 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+ 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+ 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+ 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+ 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+ 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+ 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+ 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+ 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+ 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+ 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+ 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+ 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+ 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+ 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+ 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+ 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+ 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+ 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+ 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+ 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+ 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+ 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+ 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+ 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+ 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+ 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+ 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+ 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+ 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+ 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+ 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+ 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+ 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+ 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+ 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+ 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+ 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+ 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+ 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+ 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+ 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+ 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+ 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+ 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+ 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+ 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+ 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+ 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+ 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+ 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+ 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+ 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+ 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+ 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+ 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+ 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+ 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+ 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+ 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+ 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+ 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+ 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+ 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+ 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+ 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+ 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+ 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+ 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+ 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+ 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+ 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+ 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+ 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+ 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+ 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+ 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+ 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+ 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+ 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+ 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+ 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+ 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+ 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+ 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+ 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+ 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+ 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+ 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+ 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+ 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+ 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+ 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+ 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+ 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+ 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+ 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+ 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+ 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+ 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+ 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+ 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+ 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+ 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+ 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+ 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+ 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+ 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+ 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+ 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+ 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+ 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+ 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+ 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+ 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+ 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+ 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+ 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+ 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+ 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+ 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+ 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+ 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+ 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+ 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+ 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+ 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+ 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+ 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+ 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+ 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+ 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+ 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+ 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+ 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+ 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+ 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+ 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+ 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+ 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+ 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+ 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+ 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+ 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+ 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+ 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+ 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+ 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+ 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+ 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+ 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+ 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+ 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+ 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+ 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+ 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+ 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+ 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+ 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+ 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+ 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+ 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+ 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+ 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+ 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+ 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+ 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+ 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+ 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+ 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+ 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+ 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+ 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+ 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+ 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+ 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+ 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+ 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+ 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+ 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+ 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+ 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+ 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+ 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+ 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+ 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+ 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+ 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+ 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+ 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+ 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+ 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+ 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+ 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+ 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+ 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+ 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+ 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+ 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+ 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+ 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+ 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+ 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+ 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+ 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+ 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+ 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+ 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+ 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+ 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+ 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+ 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+ 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+ 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+ 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+ 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+ 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+ 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+ 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+ 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+ 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+ 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+ 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+ 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+ 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+ 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+ 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+ 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+ 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+ 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+ 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+ 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+ 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+ 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+ 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+ 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+ 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+ 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+ 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+ 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+ 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+ 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+ 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+ 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+ 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+ 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+ 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+ 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+ 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+ 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+ 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+ 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+ 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+ 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+ 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+ 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+ 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+ 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+ 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+ 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+ 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+ 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+ 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+ 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+ 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+ 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+ 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+ 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+ 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+ 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+ 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+ 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+ 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+ 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+ 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+ 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+ 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+ 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+ 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+ 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+ 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+ 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+ 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+ 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+ 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+ 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+ 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+ 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+ 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+ 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+ 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+ 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+ 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+ 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+ 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+ 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+ 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+ 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+ 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+ 835584U, // <0,u,2,3>: Cost 0 copy LHS
+ 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+ 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+ 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+ 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+ 835584U, // <0,u,2,u>: Cost 0 copy LHS
+ 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+ 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+ 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+ 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+ 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+ 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+ 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+ 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+ 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+ 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+ 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+ 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+ 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+ 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+ 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+ 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+ 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+ 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+ 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+ 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+ 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+ 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+ 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+ 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+ 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+ 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+ 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+ 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+ 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+ 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+ 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+ 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+ 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+ 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+ 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+ 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+ 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+ 835584U, // <0,u,u,3>: Cost 0 copy LHS
+ 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+ 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+ 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+ 835584U, // <0,u,u,u>: Cost 0 copy LHS
+ 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+ 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+ 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+ 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+ 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+ 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+ 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+ 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+ 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+ 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+ 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+ 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+ 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+ 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+ 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+ 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+ 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+ 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+ 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+ 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+ 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+ 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+ 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+ 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+ 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+ 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+ 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+ 67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+ 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+ 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+ 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+ 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+ 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+ 68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+ 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+ 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+ 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+ 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+ 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+ 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+ 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+ 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+ 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+ 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+ 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+ 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+ 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+ 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+ 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+ 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+ 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+ 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+ 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+ 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+ 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+ 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+ 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+ 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+ 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+ 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+ 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+ 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+ 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+ 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+ 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+ 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+ 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+ 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+ 67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+ 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+ 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+ 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+ 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+ 68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+ 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+ 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+ 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+ 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+ 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+ 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+ 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+ 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+ 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+ 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+ 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+ 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+ 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+ 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+ 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+ 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+ 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+ 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+ 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+ 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+ 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+ 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+ 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+ 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+ 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+ 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+ 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+ 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+ 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+ 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+ 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+ 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+ 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+ 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+ 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+ 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+ 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+ 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+ 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+ 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+ 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+ 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+ 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+ 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+ 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+ 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+ 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+ 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+ 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+ 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+ 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+ 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+ 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+ 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+ 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+ 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+ 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+ 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+ 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+ 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+ 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+ 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+ 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+ 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+ 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+ 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+ 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+ 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+ 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+ 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+ 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+ 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+ 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+ 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+ 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+ 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+ 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+ 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+ 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+ 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+ 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+ 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+ 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+ 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+ 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+ 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+ 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+ 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+ 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+ 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+ 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+ 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+ 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+ 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+ 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+ 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+ 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+ 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+ 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+ 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+ 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+ 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+ 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+ 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+ 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+ 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+ 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+ 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+ 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+ 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+ 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+ 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+ 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+ 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+ 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+ 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+ 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+ 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+ 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+ 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+ 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+ 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+ 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+ 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+ 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+ 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+ 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+ 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+ 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+ 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+ 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+ 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+ 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+ 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+ 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+ 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+ 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+ 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+ 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+ 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+ 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+ 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+ 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+ 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+ 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+ 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+ 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+ 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+ 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+ 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+ 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+ 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+ 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+ 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+ 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+ 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+ 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+ 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+ 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+ 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+ 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+ 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+ 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+ 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+ 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+ 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+ 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+ 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+ 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+ 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+ 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+ 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+ 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+ 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+ 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+ 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+ 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+ 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+ 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+ 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+ 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+ 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+ 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+ 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+ 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+ 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+ 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+ 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+ 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+ 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+ 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+ 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+ 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+ 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+ 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+ 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+ 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+ 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+ 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+ 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+ 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+ 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+ 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+ 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+ 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+ 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+ 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+ 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+ 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+ 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+ 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+ 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+ 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+ 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+ 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+ 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+ 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+ 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+ 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+ 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+ 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+ 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+ 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+ 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+ 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+ 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+ 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+ 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+ 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+ 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+ 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+ 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+ 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+ 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+ 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+ 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+ 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+ 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+ 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+ 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+ 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+ 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+ 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+ 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+ 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+ 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+ 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+ 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+ 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+ 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+ 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+ 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+ 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+ 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+ 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+ 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+ 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+ 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+ 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+ 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+ 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+ 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+ 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+ 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+ 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+ 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+ 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+ 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+ 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+ 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+ 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+ 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+ 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+ 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+ 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+ 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+ 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+ 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+ 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+ 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+ 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+ 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+ 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+ 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+ 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+ 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+ 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+ 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+ 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+ 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+ 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+ 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+ 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+ 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+ 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+ 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+ 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+ 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+ 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+ 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+ 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+ 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+ 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+ 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+ 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+ 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+ 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+ 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+ 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+ 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+ 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+ 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+ 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+ 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+ 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+ 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+ 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+ 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+ 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+ 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+ 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+ 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+ 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+ 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+ 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+ 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+ 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+ 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+ 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+ 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+ 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+ 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+ 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+ 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+ 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+ 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+ 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+ 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+ 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+ 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+ 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+ 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+ 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+ 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+ 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+ 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+ 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+ 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+ 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+ 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+ 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+ 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+ 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+ 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+ 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+ 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+ 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+ 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+ 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+ 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+ 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+ 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+ 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+ 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+ 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+ 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+ 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+ 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+ 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+ 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+ 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+ 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+ 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+ 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+ 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+ 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+ 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+ 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+ 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+ 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+ 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+ 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+ 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+ 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+ 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+ 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+ 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+ 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+ 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+ 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+ 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+ 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+ 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+ 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+ 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+ 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+ 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+ 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+ 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+ 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+ 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+ 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+ 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+ 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+ 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+ 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+ 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+ 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+ 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+ 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+ 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+ 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+ 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+ 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+ 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+ 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+ 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+ 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+ 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+ 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+ 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+ 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+ 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+ 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+ 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+ 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+ 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+ 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+ 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+ 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+ 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+ 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+ 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+ 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+ 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+ 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+ 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+ 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+ 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+ 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+ 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+ 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+ 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+ 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+ 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+ 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+ 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+ 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+ 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+ 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+ 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+ 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+ 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+ 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+ 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+ 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+ 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+ 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+ 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+ 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+ 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+ 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+ 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+ 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+ 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+ 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+ 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+ 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+ 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+ 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+ 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+ 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+ 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+ 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+ 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+ 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+ 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+ 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+ 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+ 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+ 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+ 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+ 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+ 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+ 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+ 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+ 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+ 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+ 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+ 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+ 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+ 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+ 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+ 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+ 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+ 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+ 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+ 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+ 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+ 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+ 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+ 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+ 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+ 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+ 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+ 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+ 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+ 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+ 115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+ 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+ 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+ 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+ 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+ 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+ 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+ 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+ 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+ 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+ 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+ 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+ 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+ 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+ 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+ 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+ 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+ 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+ 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+ 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+ 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+ 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+ 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+ 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+ 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+ 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+ 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+ 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+ 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+ 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+ 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+ 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+ 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+ 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+ 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+ 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+ 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+ 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+ 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+ 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+ 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+ 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+ 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+ 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+ 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+ 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+ 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+ 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+ 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+ 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+ 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+ 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+ 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+ 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+ 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+ 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+ 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+ 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+ 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+ 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+ 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+ 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+ 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+ 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+ 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+ 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+ 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+ 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+ 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+ 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+ 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+ 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+ 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+ 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+ 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+ 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+ 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+ 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+ 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+ 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+ 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+ 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+ 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+ 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+ 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+ 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+ 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+ 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+ 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+ 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+ 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+ 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+ 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+ 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+ 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+ 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+ 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+ 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+ 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+ 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+ 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+ 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+ 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+ 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+ 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+ 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+ 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+ 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+ 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+ 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+ 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+ 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+ 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+ 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+ 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+ 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+ 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+ 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+ 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+ 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+ 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+ 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+ 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+ 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+ 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+ 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+ 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+ 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+ 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+ 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+ 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+ 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+ 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+ 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+ 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+ 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+ 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+ 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+ 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+ 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+ 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+ 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+ 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+ 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+ 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+ 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+ 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+ 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+ 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+ 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+ 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+ 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+ 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+ 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+ 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+ 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+ 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+ 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+ 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+ 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+ 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+ 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+ 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+ 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+ 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+ 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+ 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+ 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+ 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+ 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+ 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+ 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+ 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+ 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+ 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+ 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+ 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+ 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+ 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+ 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+ 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+ 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+ 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+ 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+ 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+ 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+ 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+ 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+ 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+ 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+ 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+ 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+ 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+ 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+ 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+ 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+ 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+ 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+ 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+ 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+ 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+ 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+ 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+ 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+ 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+ 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+ 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+ 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+ 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+ 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+ 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+ 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+ 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+ 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+ 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+ 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+ 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+ 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+ 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+ 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+ 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+ 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+ 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+ 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+ 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+ 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+ 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+ 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+ 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+ 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+ 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+ 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+ 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+ 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+ 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+ 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+ 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+ 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+ 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+ 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+ 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+ 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+ 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+ 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+ 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+ 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+ 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+ 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+ 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+ 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+ 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+ 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+ 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+ 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+ 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+ 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+ 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+ 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+ 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+ 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+ 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+ 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+ 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+ 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+ 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+ 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+ 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+ 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+ 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+ 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+ 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+ 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+ 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+ 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+ 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+ 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+ 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+ 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+ 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+ 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+ 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+ 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+ 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+ 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+ 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+ 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+ 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+ 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+ 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+ 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+ 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+ 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+ 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+ 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+ 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+ 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+ 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+ 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+ 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+ 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+ 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+ 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+ 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+ 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+ 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+ 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+ 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+ 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+ 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+ 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+ 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+ 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+ 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+ 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+ 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+ 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+ 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+ 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+ 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+ 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+ 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+ 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+ 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+ 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+ 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+ 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+ 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+ 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+ 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+ 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+ 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+ 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+ 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+ 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+ 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+ 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+ 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+ 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+ 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+ 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+ 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+ 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+ 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+ 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+ 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+ 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+ 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+ 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+ 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+ 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+ 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+ 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+ 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+ 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+ 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+ 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+ 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+ 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+ 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+ 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+ 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+ 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+ 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+ 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+ 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+ 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+ 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+ 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+ 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+ 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+ 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+ 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+ 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+ 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+ 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+ 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+ 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+ 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+ 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+ 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+ 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+ 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+ 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+ 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+ 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+ 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+ 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+ 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+ 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+ 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+ 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+ 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+ 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+ 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+ 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+ 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+ 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+ 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+ 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+ 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+ 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+ 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+ 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+ 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+ 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+ 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+ 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+ 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+ 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+ 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+ 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+ 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+ 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+ 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+ 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+ 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+ 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+ 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+ 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+ 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+ 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+ 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+ 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+ 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+ 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+ 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+ 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+ 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+ 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+ 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+ 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+ 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+ 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+ 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+ 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+ 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+ 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+ 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+ 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+ 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+ 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+ 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+ 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+ 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+ 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+ 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+ 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+ 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+ 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+ 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+ 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+ 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+ 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+ 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+ 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+ 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+ 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+ 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+ 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+ 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+ 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+ 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+ 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+ 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+ 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+ 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+ 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+ 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+ 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+ 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+ 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+ 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+ 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+ 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+ 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+ 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+ 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+ 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+ 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+ 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+ 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+ 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+ 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+ 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+ 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+ 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+ 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+ 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+ 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+ 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+ 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+ 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+ 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+ 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+ 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+ 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+ 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+ 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+ 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+ 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+ 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+ 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+ 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+ 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+ 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+ 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+ 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+ 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+ 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+ 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+ 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+ 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+ 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+ 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+ 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+ 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+ 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+ 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+ 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+ 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+ 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+ 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+ 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+ 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+ 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+ 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+ 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+ 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+ 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+ 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+ 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+ 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+ 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+ 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+ 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+ 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+ 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+ 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+ 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+ 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+ 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+ 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+ 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+ 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+ 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+ 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+ 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+ 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+ 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+ 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+ 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+ 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+ 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+ 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+ 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+ 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+ 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+ 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+ 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+ 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+ 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+ 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+ 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+ 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+ 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+ 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+ 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+ 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+ 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+ 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+ 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+ 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+ 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+ 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+ 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+ 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+ 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+ 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+ 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+ 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+ 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+ 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+ 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+ 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+ 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+ 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+ 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+ 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+ 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+ 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+ 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+ 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+ 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+ 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+ 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+ 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+ 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+ 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+ 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+ 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+ 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+ 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+ 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+ 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+ 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+ 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+ 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+ 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+ 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+ 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+ 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+ 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+ 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+ 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+ 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+ 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+ 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+ 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+ 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+ 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+ 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+ 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+ 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+ 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+ 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+ 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+ 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+ 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+ 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+ 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+ 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+ 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+ 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+ 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+ 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+ 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+ 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+ 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+ 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+ 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+ 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+ 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+ 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+ 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+ 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+ 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+ 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+ 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+ 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+ 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+ 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+ 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+ 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+ 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+ 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+ 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+ 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+ 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+ 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+ 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+ 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+ 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+ 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+ 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+ 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+ 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+ 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+ 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+ 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+ 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+ 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+ 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+ 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+ 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+ 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+ 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+ 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+ 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+ 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+ 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+ 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+ 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+ 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+ 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+ 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+ 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+ 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+ 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+ 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+ 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+ 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+ 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+ 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+ 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+ 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+ 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+ 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+ 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+ 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+ 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+ 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+ 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+ 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+ 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+ 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+ 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+ 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+ 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+ 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+ 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+ 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+ 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+ 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+ 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+ 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+ 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+ 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+ 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+ 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+ 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+ 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+ 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+ 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+ 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+ 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+ 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+ 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+ 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+ 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+ 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+ 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+ 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+ 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+ 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+ 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+ 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+ 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+ 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+ 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+ 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+ 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+ 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+ 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+ 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+ 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+ 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+ 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+ 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+ 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+ 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+ 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+ 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+ 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+ 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+ 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+ 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+ 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+ 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+ 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+ 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+ 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+ 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+ 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+ 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+ 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+ 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+ 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+ 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+ 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+ 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+ 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+ 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+ 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+ 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+ 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+ 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+ 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+ 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+ 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+ 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+ 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+ 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+ 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+ 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+ 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+ 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+ 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+ 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+ 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+ 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+ 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+ 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+ 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+ 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+ 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+ 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+ 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+ 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+ 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+ 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+ 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+ 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+ 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+ 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+ 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+ 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+ 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+ 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+ 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+ 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+ 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+ 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+ 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+ 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+ 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+ 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+ 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+ 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+ 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+ 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+ 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+ 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+ 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+ 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+ 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+ 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+ 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+ 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+ 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+ 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+ 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+ 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+ 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+ 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+ 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+ 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+ 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+ 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+ 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+ 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+ 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+ 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+ 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+ 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+ 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+ 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+ 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+ 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+ 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+ 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+ 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+ 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+ 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+ 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+ 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+ 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+ 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+ 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+ 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+ 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+ 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+ 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+ 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+ 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+ 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+ 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+ 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+ 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+ 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+ 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+ 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+ 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+ 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+ 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+ 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+ 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+ 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+ 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+ 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+ 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+ 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+ 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+ 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+ 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+ 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+ 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+ 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+ 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+ 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+ 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+ 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+ 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+ 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+ 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+ 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+ 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+ 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+ 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+ 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+ 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+ 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+ 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+ 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+ 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+ 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+ 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+ 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+ 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+ 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+ 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+ 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+ 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+ 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+ 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+ 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+ 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+ 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+ 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+ 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+ 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+ 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+ 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+ 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+ 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+ 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+ 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+ 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+ 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+ 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+ 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+ 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+ 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+ 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+ 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+ 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+ 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+ 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+ 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+ 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+ 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+ 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+ 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+ 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+ 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+ 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+ 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+ 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+ 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+ 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+ 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+ 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+ 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+ 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+ 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+ 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+ 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+ 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+ 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+ 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+ 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+ 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+ 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+ 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+ 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+ 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+ 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+ 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+ 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+ 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+ 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+ 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+ 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+ 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+ 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+ 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+ 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+ 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+ 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+ 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+ 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+ 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+ 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+ 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+ 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+ 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+ 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+ 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+ 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+ 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+ 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+ 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+ 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+ 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+ 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+ 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+ 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+ 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+ 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+ 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+ 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+ 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+ 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+ 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+ 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+ 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+ 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+ 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+ 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+ 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+ 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+ 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+ 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+ 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+ 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+ 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+ 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+ 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+ 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+ 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+ 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+ 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+ 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+ 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+ 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+ 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+ 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+ 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+ 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+ 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+ 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+ 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+ 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+ 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+ 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+ 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+ 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+ 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+ 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+ 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+ 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+ 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+ 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+ 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+ 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+ 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+ 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+ 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+ 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+ 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+ 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+ 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+ 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+ 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+ 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+ 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+ 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+ 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+ 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+ 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+ 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+ 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+ 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+ 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+ 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+ 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+ 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+ 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+ 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+ 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+ 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+ 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+ 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+ 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+ 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+ 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+ 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+ 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+ 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+ 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+ 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+ 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+ 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+ 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+ 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+ 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+ 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+ 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+ 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+ 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+ 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+ 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+ 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+ 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+ 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+ 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+ 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+ 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+ 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+ 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+ 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+ 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+ 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+ 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+ 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+ 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+ 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+ 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+ 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+ 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+ 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+ 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+ 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+ 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+ 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+ 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+ 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+ 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+ 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+ 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+ 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+ 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+ 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+ 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+ 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+ 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+ 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+ 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+ 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+ 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+ 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+ 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+ 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+ 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+ 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+ 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+ 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+ 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+ 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+ 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+ 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+ 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+ 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+ 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+ 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+ 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+ 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+ 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+ 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+ 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+ 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+ 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+ 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+ 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+ 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+ 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+ 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+ 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+ 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+ 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+ 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+ 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+ 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+ 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+ 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+ 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+ 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+ 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+ 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+ 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+ 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+ 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+ 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+ 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+ 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+ 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+ 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+ 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+ 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+ 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+ 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+ 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+ 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+ 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+ 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+ 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+ 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+ 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+ 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+ 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+ 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+ 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+ 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+ 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+ 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+ 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+ 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+ 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+ 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+ 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+ 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+ 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+ 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+ 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+ 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+ 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+ 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+ 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+ 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+ 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+ 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+ 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+ 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+ 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+ 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+ 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+ 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+ 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+ 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+ 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+ 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+ 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+ 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+ 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+ 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+ 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+ 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+ 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+ 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+ 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+ 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+ 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+ 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+ 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+ 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+ 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+ 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+ 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+ 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+ 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+ 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+ 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+ 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+ 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+ 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+ 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+ 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+ 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+ 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+ 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+ 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+ 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+ 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+ 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+ 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+ 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+ 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+ 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+ 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+ 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+ 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+ 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+ 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+ 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+ 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+ 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+ 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+ 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+ 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+ 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+ 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+ 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+ 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+ 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+ 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+ 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+ 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+ 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+ 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+ 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+ 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+ 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+ 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+ 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+ 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+ 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+ 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+ 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+ 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+ 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+ 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+ 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+ 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+ 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+ 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+ 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+ 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+ 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+ 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+ 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+ 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+ 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+ 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+ 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+ 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+ 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+ 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+ 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+ 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+ 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+ 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+ 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+ 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+ 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+ 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+ 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+ 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+ 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+ 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+ 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+ 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+ 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+ 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+ 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+ 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+ 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+ 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+ 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+ 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+ 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+ 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+ 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+ 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+ 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+ 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+ 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+ 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+ 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+ 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+ 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+ 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+ 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+ 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+ 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+ 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+ 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+ 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+ 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+ 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+ 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+ 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+ 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+ 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+ 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+ 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+ 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+ 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+ 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+ 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+ 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+ 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+ 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+ 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+ 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+ 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+ 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+ 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+ 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+ 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+ 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+ 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+ 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+ 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+ 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+ 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+ 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+ 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+ 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+ 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+ 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+ 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+ 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+ 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+ 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+ 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+ 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+ 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+ 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+ 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+ 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+ 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+ 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+ 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+ 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+ 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+ 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+ 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+ 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+ 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+ 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+ 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+ 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+ 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+ 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+ 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+ 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+ 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+ 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+ 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+ 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+ 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+ 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+ 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+ 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+ 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+ 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+ 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+ 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+ 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+ 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+ 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+ 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+ 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+ 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+ 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+ 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+ 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+ 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+ 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+ 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+ 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+ 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+ 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+ 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+ 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+ 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+ 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+ 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+ 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+ 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+ 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+ 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+ 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+ 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+ 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+ 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+ 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+ 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+ 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+ 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+ 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+ 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+ 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+ 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+ 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+ 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+ 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+ 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+ 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+ 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+ 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+ 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+ 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+ 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+ 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+ 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+ 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+ 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+ 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+ 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+ 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+ 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+ 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+ 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+ 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+ 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+ 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+ 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+ 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+ 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+ 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+ 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+ 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+ 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+ 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+ 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+ 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+ 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+ 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+ 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+ 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+ 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+ 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+ 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+ 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+ 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+ 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+ 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+ 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+ 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+ 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+ 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+ 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+ 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+ 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+ 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+ 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+ 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+ 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+ 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+ 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+ 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+ 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+ 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+ 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+ 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+ 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+ 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+ 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+ 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+ 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+ 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+ 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+ 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+ 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+ 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+ 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+ 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+ 27705344U, // <4,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,6,u>: Cost 0 copy RHS
+ 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+ 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+ 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+ 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+ 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+ 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+ 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+ 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+ 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+ 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+ 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+ 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+ 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+ 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+ 27705344U, // <4,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,u,u>: Cost 0 copy RHS
+ 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+ 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+ 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+ 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+ 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+ 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+ 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+ 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+ 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+ 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+ 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+ 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+ 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+ 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+ 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+ 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+ 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+ 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+ 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+ 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+ 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+ 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+ 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+ 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+ 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+ 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+ 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+ 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+ 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+ 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+ 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+ 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+ 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+ 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+ 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+ 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+ 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+ 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+ 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+ 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+ 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+ 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+ 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+ 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+ 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+ 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+ 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+ 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+ 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+ 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+ 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+ 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+ 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+ 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+ 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+ 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+ 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+ 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+ 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+ 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+ 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+ 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+ 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+ 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+ 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+ 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+ 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+ 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+ 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+ 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+ 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+ 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+ 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+ 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+ 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+ 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+ 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+ 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+ 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+ 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+ 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+ 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+ 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+ 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+ 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+ 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+ 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+ 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+ 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+ 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+ 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+ 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+ 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+ 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+ 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+ 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+ 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+ 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+ 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+ 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+ 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+ 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+ 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+ 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+ 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+ 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+ 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+ 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+ 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+ 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+ 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+ 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+ 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+ 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+ 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+ 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+ 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+ 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+ 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+ 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+ 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+ 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+ 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+ 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+ 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+ 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+ 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+ 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+ 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+ 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+ 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+ 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+ 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+ 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+ 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+ 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+ 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+ 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+ 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+ 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+ 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+ 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+ 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+ 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+ 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+ 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+ 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+ 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+ 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+ 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+ 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+ 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+ 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+ 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+ 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+ 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+ 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+ 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+ 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+ 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+ 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+ 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+ 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+ 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+ 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+ 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+ 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+ 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+ 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+ 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+ 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+ 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+ 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+ 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+ 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+ 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+ 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+ 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+ 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+ 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+ 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+ 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+ 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+ 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+ 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+ 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+ 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+ 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+ 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+ 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+ 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+ 27705344U, // <4,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,6,u>: Cost 0 copy RHS
+ 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+ 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+ 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+ 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+ 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+ 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+ 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+ 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+ 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+ 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+ 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 27705344U, // <4,u,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,u,u>: Cost 0 copy RHS
+ 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+ 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+ 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+ 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+ 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+ 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+ 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+ 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+ 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+ 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+ 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+ 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+ 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+ 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+ 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+ 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+ 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+ 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+ 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+ 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+ 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+ 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+ 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+ 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+ 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+ 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+ 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+ 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+ 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+ 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+ 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+ 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+ 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+ 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+ 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+ 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+ 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+ 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+ 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+ 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+ 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+ 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+ 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+ 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+ 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+ 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+ 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+ 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+ 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+ 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+ 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+ 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+ 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+ 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+ 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+ 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+ 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+ 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+ 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+ 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+ 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+ 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+ 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+ 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+ 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+ 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+ 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+ 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+ 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+ 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+ 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+ 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+ 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+ 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+ 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+ 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+ 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+ 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+ 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+ 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+ 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+ 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+ 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+ 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+ 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+ 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+ 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+ 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+ 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+ 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+ 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+ 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+ 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+ 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+ 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+ 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+ 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+ 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+ 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+ 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+ 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+ 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+ 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+ 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+ 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+ 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+ 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+ 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+ 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+ 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+ 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+ 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+ 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+ 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+ 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+ 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+ 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+ 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+ 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+ 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+ 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+ 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+ 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+ 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+ 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+ 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+ 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+ 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+ 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+ 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+ 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+ 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+ 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+ 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+ 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+ 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+ 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+ 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+ 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+ 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+ 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+ 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+ 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+ 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+ 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+ 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+ 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+ 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+ 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+ 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+ 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+ 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+ 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+ 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+ 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+ 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+ 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+ 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+ 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+ 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+ 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+ 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+ 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+ 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+ 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+ 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+ 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+ 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+ 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+ 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+ 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+ 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+ 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+ 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+ 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+ 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+ 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+ 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+ 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+ 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+ 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+ 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+ 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+ 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+ 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+ 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+ 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+ 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+ 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+ 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+ 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+ 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+ 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+ 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+ 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+ 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+ 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+ 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+ 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+ 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+ 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+ 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+ 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+ 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+ 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+ 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+ 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+ 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+ 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+ 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+ 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+ 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+ 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+ 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+ 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+ 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+ 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+ 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+ 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+ 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+ 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+ 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+ 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+ 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+ 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+ 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+ 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+ 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+ 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+ 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+ 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+ 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+ 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+ 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+ 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+ 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+ 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+ 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+ 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+ 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+ 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+ 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+ 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+ 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+ 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+ 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+ 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+ 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+ 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+ 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+ 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+ 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+ 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+ 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+ 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+ 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+ 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+ 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+ 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+ 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+ 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+ 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+ 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+ 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+ 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+ 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+ 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+ 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+ 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+ 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+ 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+ 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+ 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+ 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+ 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+ 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+ 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+ 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+ 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+ 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+ 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+ 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+ 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+ 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+ 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+ 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+ 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+ 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+ 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+ 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+ 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+ 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+ 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+ 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+ 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+ 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+ 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+ 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+ 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+ 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+ 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+ 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+ 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+ 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+ 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+ 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+ 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+ 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+ 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+ 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+ 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+ 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+ 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+ 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+ 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+ 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+ 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+ 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+ 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+ 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+ 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+ 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+ 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+ 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+ 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+ 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+ 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+ 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+ 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+ 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+ 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+ 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+ 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+ 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+ 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+ 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+ 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+ 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+ 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+ 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+ 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+ 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+ 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+ 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+ 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+ 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+ 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+ 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+ 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+ 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+ 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+ 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+ 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+ 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+ 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+ 94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+ 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+ 94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+ 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+ 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+ 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+ 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+ 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+ 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+ 94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+ 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+ 94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+ 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+ 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+ 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+ 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+ 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+ 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+ 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+ 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+ 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+ 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+ 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+ 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+ 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+ 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+ 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+ 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+ 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+ 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+ 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+ 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+ 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+ 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+ 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+ 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+ 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+ 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+ 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+ 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+ 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+ 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+ 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+ 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+ 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+ 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+ 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+ 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+ 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+ 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+ 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+ 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+ 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+ 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+ 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+ 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+ 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+ 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+ 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+ 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+ 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+ 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+ 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+ 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+ 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+ 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+ 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+ 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+ 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+ 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+ 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+ 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+ 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+ 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+ 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+ 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+ 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+ 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+ 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+ 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+ 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+ 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+ 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+ 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+ 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+ 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+ 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+ 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+ 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+ 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+ 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+ 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+ 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+ 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+ 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+ 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+ 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+ 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+ 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+ 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+ 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+ 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+ 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+ 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+ 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+ 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+ 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+ 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+ 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+ 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+ 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+ 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+ 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+ 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+ 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+ 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+ 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+ 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+ 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+ 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+ 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+ 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+ 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+ 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+ 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+ 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+ 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+ 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+ 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+ 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+ 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+ 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+ 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+ 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+ 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+ 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+ 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+ 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+ 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+ 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+ 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+ 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+ 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+ 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+ 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+ 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+ 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+ 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+ 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+ 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+ 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+ 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+ 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+ 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+ 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+ 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+ 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+ 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+ 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+ 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+ 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+ 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+ 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+ 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+ 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+ 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+ 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+ 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+ 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+ 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+ 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+ 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+ 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+ 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+ 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+ 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+ 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+ 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+ 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+ 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+ 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+ 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+ 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+ 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+ 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+ 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+ 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+ 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+ 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+ 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+ 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+ 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+ 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+ 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+ 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+ 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+ 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+ 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+ 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+ 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+ 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+ 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+ 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+ 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+ 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+ 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+ 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+ 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+ 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+ 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+ 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+ 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+ 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+ 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+ 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+ 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+ 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+ 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+ 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+ 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+ 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+ 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+ 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+ 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+ 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+ 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+ 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+ 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+ 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+ 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+ 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+ 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+ 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+ 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+ 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+ 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+ 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+ 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+ 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+ 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+ 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+ 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+ 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+ 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+ 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+ 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+ 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+ 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+ 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+ 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+ 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+ 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+ 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+ 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+ 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+ 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+ 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+ 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+ 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+ 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+ 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+ 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+ 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+ 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+ 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+ 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+ 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+ 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+ 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+ 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+ 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+ 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+ 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+ 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+ 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+ 118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+ 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+ 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+ 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+ 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+ 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+ 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+ 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+ 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+ 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+ 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+ 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+ 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+ 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+ 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+ 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+ 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+ 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+ 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+ 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+ 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+ 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+ 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+ 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+ 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+ 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+ 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+ 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+ 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+ 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+ 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+ 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+ 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+ 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+ 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+ 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+ 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+ 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+ 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+ 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+ 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+ 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+ 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+ 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+ 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+ 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+ 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+ 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+ 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+ 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+ 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+ 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+ 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+ 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+ 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+ 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+ 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+ 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+ 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+ 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+ 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+ 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+ 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+ 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+ 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+ 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+ 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+ 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+ 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+ 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+ 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+ 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+ 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+ 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+ 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+ 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+ 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+ 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+ 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+ 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+ 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+ 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+ 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+ 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+ 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+ 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+ 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+ 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+ 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+ 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+ 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+ 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+ 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+ 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+ 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+ 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+ 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+ 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+ 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+ 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+ 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+ 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+ 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+ 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+ 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+ 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+ 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+ 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+ 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+ 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+ 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+ 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+ 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+ 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+ 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+ 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+ 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+ 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+ 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+ 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+ 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+ 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+ 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+ 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+ 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+ 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+ 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+ 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+ 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+ 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+ 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+ 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+ 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+ 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+ 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+ 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+ 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+ 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+ 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+ 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+ 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+ 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+ 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+ 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+ 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+ 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+ 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+ 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+ 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+ 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+ 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+ 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+ 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+ 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+ 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+ 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+ 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+ 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+ 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+ 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+ 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+ 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+ 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+ 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+ 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+ 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+ 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+ 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+ 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+ 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+ 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+ 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+ 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+ 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+ 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+ 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+ 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+ 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+ 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+ 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+ 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+ 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+ 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+ 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+ 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+ 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+ 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+ 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+ 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+ 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+ 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+ 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+ 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+ 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+ 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+ 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+ 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+ 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+ 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+ 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+ 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+ 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+ 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+ 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+ 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+ 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+ 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+ 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+ 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+ 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+ 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+ 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+ 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+ 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+ 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+ 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+ 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+ 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+ 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+ 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+ 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+ 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+ 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+ 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+ 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+ 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+ 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+ 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+ 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+ 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+ 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+ 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+ 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+ 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+ 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+ 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+ 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+ 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+ 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+ 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+ 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+ 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+ 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+ 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+ 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+ 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+ 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+ 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+ 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+ 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+ 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+ 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+ 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+ 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+ 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+ 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+ 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+ 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+ 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+ 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+ 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+ 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+ 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+ 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+ 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+ 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+ 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+ 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+ 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+ 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+ 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+ 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+ 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+ 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+ 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+ 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+ 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+ 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+ 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+ 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+ 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+ 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+ 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+ 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+ 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+ 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+ 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+ 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+ 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+ 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+ 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+ 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+ 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+ 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+ 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+ 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+ 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+ 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+ 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+ 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+ 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+ 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+ 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+ 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+ 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+ 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+ 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+ 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+ 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+ 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+ 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+ 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+ 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+ 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+ 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+ 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+ 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+ 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+ 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+ 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+ 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+ 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+ 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+ 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+ 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+ 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+ 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+ 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+ 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+ 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+ 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+ 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+ 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+ 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+ 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+ 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+ 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+ 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+ 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+ 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+ 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+ 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+ 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+ 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+ 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+ 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+ 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+ 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+ 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+ 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+ 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+ 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+ 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+ 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+ 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+ 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+ 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+ 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+ 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+ 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+ 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+ 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+ 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+ 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+ 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+ 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+ 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+ 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+ 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+ 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+ 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+ 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+ 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+ 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+ 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+ 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+ 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+ 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+ 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+ 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+ 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+ 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+ 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+ 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+ 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+ 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+ 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+ 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+ 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+ 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+ 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+ 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+ 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+ 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+ 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+ 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+ 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+ 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+ 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+ 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+ 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+ 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+ 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+ 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+ 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+ 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+ 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+ 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+ 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+ 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+ 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+ 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+ 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+ 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+ 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+ 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+ 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+ 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+ 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+ 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+ 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+ 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+ 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+ 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+ 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+ 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+ 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+ 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+ 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+ 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+ 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+ 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+ 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+ 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+ 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+ 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+ 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+ 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+ 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+ 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+ 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+ 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+ 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+ 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+ 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+ 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+ 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+ 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+ 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+ 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+ 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+ 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+ 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+ 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+ 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+ 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+ 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+ 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+ 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+ 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+ 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+ 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+ 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+ 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+ 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+ 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+ 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+ 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+ 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+ 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+ 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+ 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+ 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+ 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+ 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+ 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+ 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+ 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+ 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+ 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+ 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+ 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+ 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+ 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+ 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+ 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+ 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+ 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+ 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+ 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+ 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+ 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+ 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+ 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+ 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+ 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+ 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+ 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+ 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+ 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+ 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+ 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+ 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+ 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+ 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+ 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+ 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+ 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+ 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+ 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+ 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+ 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+ 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+ 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+ 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+ 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+ 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+ 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+ 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+ 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+ 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+ 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+ 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+ 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+ 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+ 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+ 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+ 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+ 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+ 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+ 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+ 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+ 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+ 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+ 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+ 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+ 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+ 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+ 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+ 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+ 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+ 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+ 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+ 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+ 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+ 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+ 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+ 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+ 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+ 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+ 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+ 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+ 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+ 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+ 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+ 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+ 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+ 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+ 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+ 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+ 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+ 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+ 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+ 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+ 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+ 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+ 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+ 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+ 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+ 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+ 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+ 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+ 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+ 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+ 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+ 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+ 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+ 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+ 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+ 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+ 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+ 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+ 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+ 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+ 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+ 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+ 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+ 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+ 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+ 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+ 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+ 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+ 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+ 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+ 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+ 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+ 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+ 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+ 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+ 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+ 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+ 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+ 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+ 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+ 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+ 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+ 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+ 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+ 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+ 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+ 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+ 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+ 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+ 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+ 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+ 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+ 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+ 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+ 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+ 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+ 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+ 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+ 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+ 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+ 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+ 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+ 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+ 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+ 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+ 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+ 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+ 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+ 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+ 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+ 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+ 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+ 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+ 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+ 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+ 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+ 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+ 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+ 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+ 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+ 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+ 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+ 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+ 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+ 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+ 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+ 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+ 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+ 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+ 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+ 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+ 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+ 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+ 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+ 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+ 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+ 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+ 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+ 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+ 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+ 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+ 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+ 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+ 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+ 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+ 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+ 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+ 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+ 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+ 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+ 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+ 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+ 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+ 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+ 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+ 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+ 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+ 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+ 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+ 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+ 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+ 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+ 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+ 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+ 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+ 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+ 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+ 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+ 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+ 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+ 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+ 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+ 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+ 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+ 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+ 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+ 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+ 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+ 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+ 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+ 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+ 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+ 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+ 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+ 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+ 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+ 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+ 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+ 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+ 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+ 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+ 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+ 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+ 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+ 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+ 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+ 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+ 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+ 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+ 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+ 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+ 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+ 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+ 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+ 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+ 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+ 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+ 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+ 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+ 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+ 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+ 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+ 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+ 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+ 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+ 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+ 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+ 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+ 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+ 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+ 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+ 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+ 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+ 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+ 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+ 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+ 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+ 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+ 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+ 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+ 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+ 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+ 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+ 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+ 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+ 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+ 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+ 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+ 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+ 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+ 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+ 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+ 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+ 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+ 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+ 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+ 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+ 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+ 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+ 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+ 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+ 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+ 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+ 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+ 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+ 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+ 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+ 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+ 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+ 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+ 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+ 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+ 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+ 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+ 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+ 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+ 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+ 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+ 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+ 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+ 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+ 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+ 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+ 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+ 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+ 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+ 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+ 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+ 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+ 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+ 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+ 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+ 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+ 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+ 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+ 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+ 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+ 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+ 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+ 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+ 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+ 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+ 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+ 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+ 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+ 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+ 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+ 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+ 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+ 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+ 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+ 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+ 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+ 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+ 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+ 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+ 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+ 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+ 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+ 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+ 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+ 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+ 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+ 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+ 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+ 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+ 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+ 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+ 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+ 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+ 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+ 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+ 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+ 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+ 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+ 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+ 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+ 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+ 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+ 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+ 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+ 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+ 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+ 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+ 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+ 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+ 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+ 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+ 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+ 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+ 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+ 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+ 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+ 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+ 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+ 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+ 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+ 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+ 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+ 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+ 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+ 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+ 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+ 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+ 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+ 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+ 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+ 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+ 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+ 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+ 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+ 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+ 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+ 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+ 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+ 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+ 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+ 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+ 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+ 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+ 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+ 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+ 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+ 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+ 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+ 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+ 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+ 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+ 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+ 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+ 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+ 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+ 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+ 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+ 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+ 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+ 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+ 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+ 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+ 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+ 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+ 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+ 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+ 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+ 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+ 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+ 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+ 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+ 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+ 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+ 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+ 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+ 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+ 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+ 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+ 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+ 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+ 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+ 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+ 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+ 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+ 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+ 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+ 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+ 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+ 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+ 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+ 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+ 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+ 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+ 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+ 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+ 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+ 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+ 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+ 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+ 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+ 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+ 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+ 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+ 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+ 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+ 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+ 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+ 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+ 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+ 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+ 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+ 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+ 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+ 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+ 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+ 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+ 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+ 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+ 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+ 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+ 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+ 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+ 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+ 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+ 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+ 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+ 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+ 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+ 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+ 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+ 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+ 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+ 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+ 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+ 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+ 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+ 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+ 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+ 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+ 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+ 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+ 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+ 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+ 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+ 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+ 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+ 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+ 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+ 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+ 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+ 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+ 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+ 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+ 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+ 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+ 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+ 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+ 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+ 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+ 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+ 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+ 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+ 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+ 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+ 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+ 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+ 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+ 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+ 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+ 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+ 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+ 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+ 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+ 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+ 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+ 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+ 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+ 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+ 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+ 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+ 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+ 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+ 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+ 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+ 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+ 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+ 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+ 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+ 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+ 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+ 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+ 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+ 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+ 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+ 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+ 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+ 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+ 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+ 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+ 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+ 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+ 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+ 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+ 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+ 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+ 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+ 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+ 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+ 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+ 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+ 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+ 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+ 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+ 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+ 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+ 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+ 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+ 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+ 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+ 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+ 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+ 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+ 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+ 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+ 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+ 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+ 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+ 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+ 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+ 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+ 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+ 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+ 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+ 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+ 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+ 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+ 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+ 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+ 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+ 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+ 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+ 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+ 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+ 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+ 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+ 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+ 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+ 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+ 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+ 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+ 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+ 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+ 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+ 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+ 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+ 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+ 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+ 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+ 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+ 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+ 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+ 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+ 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+ 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+ 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+ 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+ 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+ 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+ 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+ 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+ 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+ 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+ 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+ 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+ 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+ 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+ 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+ 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+ 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+ 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+ 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+ 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+ 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+ 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+ 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+ 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+ 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+ 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+ 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+ 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+ 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+ 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+ 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+ 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+ 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+ 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+ 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+ 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+ 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+ 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+ 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+ 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+ 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+ 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+ 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+ 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+ 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+ 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+ 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+ 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+ 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+ 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+ 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+ 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+ 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+ 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+ 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+ 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+ 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+ 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+ 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+ 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+ 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+ 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+ 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+ 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+ 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+ 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+ 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+ 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+ 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+ 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+ 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+ 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+ 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+ 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+ 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+ 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+ 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+ 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+ 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+ 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+ 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+ 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+ 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+ 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+ 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+ 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+ 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+ 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+ 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+ 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+ 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+ 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+ 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+ 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+ 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+ 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+ 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+ 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+ 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+ 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+ 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+ 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+ 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+ 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+ 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+ 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+ 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+ 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+ 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+ 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+ 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+ 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+ 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+ 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+ 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+ 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+ 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+ 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+ 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+ 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+ 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+ 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+ 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+ 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+ 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+ 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+ 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+ 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+ 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+ 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+ 73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+ 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+ 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+ 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+ 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+ 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+ 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+ 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+ 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+ 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+ 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+ 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+ 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+ 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+ 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+ 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+ 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+ 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+ 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+ 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+ 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+ 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+ 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+ 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+ 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+ 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+ 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+ 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+ 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+ 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+ 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+ 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+ 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+ 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+ 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+ 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+ 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+ 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+ 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+ 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+ 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+ 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+ 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+ 835584U, // <u,1,2,3>: Cost 0 copy LHS
+ 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+ 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+ 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+ 835584U, // <u,1,2,u>: Cost 0 copy LHS
+ 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+ 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+ 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+ 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+ 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+ 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+ 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+ 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+ 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+ 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+ 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+ 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+ 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+ 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+ 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+ 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+ 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+ 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+ 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+ 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+ 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+ 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+ 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+ 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+ 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+ 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+ 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+ 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+ 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+ 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+ 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+ 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+ 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+ 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 835584U, // <u,1,u,3>: Cost 0 copy LHS
+ 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+ 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+ 835584U, // <u,1,u,u>: Cost 0 copy LHS
+ 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+ 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+ 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+ 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+ 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+ 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+ 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+ 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+ 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+ 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+ 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+ 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+ 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+ 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+ 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+ 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+ 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+ 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+ 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+ 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+ 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+ 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+ 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+ 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+ 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+ 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+ 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+ 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+ 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+ 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+ 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+ 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+ 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+ 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+ 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+ 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+ 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+ 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+ 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+ 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+ 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+ 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+ 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+ 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+ 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+ 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+ 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+ 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+ 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+ 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+ 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+ 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+ 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+ 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+ 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+ 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+ 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+ 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+ 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+ 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+ 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+ 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+ 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+ 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+ 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+ 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+ 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+ 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+ 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+ 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+ 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+ 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+ 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+ 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+ 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+ 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+ 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+ 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+ 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+ 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+ 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+ 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+ 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+ 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+ 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+ 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+ 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+ 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+ 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+ 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+ 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+ 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+ 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+ 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+ 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+ 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+ 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+ 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+ 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+ 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+ 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+ 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+ 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+ 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+ 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+ 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+ 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+ 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+ 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+ 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+ 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+ 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+ 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+ 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+ 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+ 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+ 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+ 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+ 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+ 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+ 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+ 96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+ 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+ 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+ 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+ 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+ 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+ 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+ 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+ 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+ 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+ 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+ 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+ 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+ 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+ 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+ 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+ 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+ 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+ 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+ 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+ 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+ 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+ 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+ 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+ 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+ 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+ 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+ 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+ 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+ 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+ 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+ 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+ 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+ 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+ 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+ 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+ 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+ 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+ 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+ 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+ 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+ 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+ 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+ 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+ 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+ 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+ 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+ 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+ 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+ 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+ 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+ 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+ 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+ 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 27705344U, // <u,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,6,u>: Cost 0 copy RHS
+ 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+ 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+ 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+ 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+ 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+ 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+ 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+ 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+ 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+ 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+ 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 27705344U, // <u,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,u,u>: Cost 0 copy RHS
+ 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+ 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+ 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+ 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+ 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+ 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+ 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+ 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+ 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+ 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+ 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+ 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+ 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+ 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+ 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+ 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+ 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+ 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+ 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+ 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+ 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+ 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+ 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+ 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+ 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+ 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+ 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+ 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+ 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+ 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+ 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+ 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+ 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+ 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+ 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+ 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+ 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+ 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+ 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+ 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+ 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+ 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+ 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+ 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+ 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+ 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+ 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+ 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+ 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+ 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+ 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+ 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+ 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+ 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+ 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+ 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+ 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+ 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+ 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+ 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+ 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+ 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+ 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+ 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+ 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+ 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+ 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+ 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+ 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+ 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+ 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+ 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+ 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+ 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+ 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+ 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+ 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+ 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+ 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+ 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+ 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+ 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+ 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+ 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+ 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+ 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+ 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+ 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+ 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+ 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+ 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+ 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+ 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+ 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+ 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+ 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+ 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+ 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+ 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+ 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+ 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+ 835584U, // <u,u,2,3>: Cost 0 copy LHS
+ 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+ 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 835584U, // <u,u,2,u>: Cost 0 copy LHS
+ 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+ 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+ 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+ 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+ 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+ 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+ 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+ 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+ 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+ 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+ 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+ 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+ 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+ 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+ 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+ 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+ 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+ 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+ 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+ 27705344U, // <u,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,u,6,u>: Cost 0 copy RHS
+ 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+ 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+ 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+ 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+ 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+ 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+ 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+ 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+ 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+ 835584U, // <u,u,u,3>: Cost 0 copy LHS
+ 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+ 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+ 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+ 27705344U, // <u,u,u,7>: Cost 0 copy RHS
+ 835584U, // <u,u,u,u>: Cost 0 copy LHS
+ 0
+};
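The generated table ends here. Each 32-bit entry above encodes the cheapest
known expansion of one 4-element shuffle mask into NEON permute operations.
The sketch below shows how an entry is meant to be unpacked; the bit layout
follows the decoder in GeneratePerfectShuffle() in ARMISelLowering.cpp (also
part of this change), but the helper names decodePFEntry and pfTableIndex
are hypothetical, for illustration only.

    // Illustration only: hypothetical helpers, not part of this patch.
    // Each entry packs: [31:30] a saturated cost, [29:26] an opcode
    // (OP_COPY, OP_VREV, OP_VDUP0-3, OP_VEXT1-3, OP_VUZPL/R, OP_VZIPL/R,
    // OP_VTRNL/R), [25:13] the LHS operand's table index, [12:0] the RHS
    // operand's table index.  The two 13-bit ids point back into
    // PerfectShuffleTable itself, so each entry is the root of a small
    // recursion tree that bottoms out at OP_COPY leaves (<0,1,2,3> means
    // take LHS unchanged, <4,5,6,7> means take RHS unchanged).
    struct PFEntryFields {
      unsigned Cost, OpNum, LHSID, RHSID;
    };

    static PFEntryFields decodePFEntry(unsigned PFEntry) {
      PFEntryFields F;
      F.Cost  =  PFEntry >> 30;
      F.OpNum = (PFEntry >> 26) & 0x0F;
      F.LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
      F.RHSID =  PFEntry        & ((1 << 13) - 1);
      return F;
    }

    // A mask <e0,e1,e2,e3>, where 0-7 picks a lane of the two concatenated
    // inputs and 8 stands for 'u' (undef), selects its entry by base-9
    // indexing:
    static unsigned pfTableIndex(unsigned E0, unsigned E1, unsigned E2,
                                 unsigned E3) {
      return ((E0 * 9 + E1) * 9 + E2) * 9 + E3;
    }

As a cross-check, entry 363253046U above (<7,7,7,7>) decodes to opcode 5
(OP_VDUP3) with operand id 3382, the index of the <4,5,6,7> identity entry:
a single vdup of lane 3 of the right-hand input, matching its "vdup3 RHS"
comment. The 2-bit cost field is saturated, so it need not equal the
instruction count printed in the comments.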
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
index f809f37..d5bc3f6 100644
--- a/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -13,6 +13,7 @@
#include "ARM.h"
#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
@@ -26,6 +27,7 @@
#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetFrameInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -33,1370 +35,7 @@
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
-unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
- using namespace ARM;
- switch (RegEnum) {
- case R0: case S0: case D0: return 0;
- case R1: case S1: case D1: return 1;
- case R2: case S2: case D2: return 2;
- case R3: case S3: case D3: return 3;
- case R4: case S4: case D4: return 4;
- case R5: case S5: case D5: return 5;
- case R6: case S6: case D6: return 6;
- case R7: case S7: case D7: return 7;
- case R8: case S8: case D8: return 8;
- case R9: case S9: case D9: return 9;
- case R10: case S10: case D10: return 10;
- case R11: case S11: case D11: return 11;
- case R12: case S12: case D12: return 12;
- case SP: case S13: case D13: return 13;
- case LR: case S14: case D14: return 14;
- case PC: case S15: case D15: return 15;
- case S16: return 16;
- case S17: return 17;
- case S18: return 18;
- case S19: return 19;
- case S20: return 20;
- case S21: return 21;
- case S22: return 22;
- case S23: return 23;
- case S24: return 24;
- case S25: return 25;
- case S26: return 26;
- case S27: return 27;
- case S28: return 28;
- case S29: return 29;
- case S30: return 30;
- case S31: return 31;
- default:
- assert(0 && "Unknown ARM register!");
- abort();
- }
-}
-
-unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum,
- bool &isSPVFP) {
- isSPVFP = false;
-
- using namespace ARM;
- switch (RegEnum) {
- default:
- assert(0 && "Unknown ARM register!");
- abort();
- case R0: case D0: return 0;
- case R1: case D1: return 1;
- case R2: case D2: return 2;
- case R3: case D3: return 3;
- case R4: case D4: return 4;
- case R5: case D5: return 5;
- case R6: case D6: return 6;
- case R7: case D7: return 7;
- case R8: case D8: return 8;
- case R9: case D9: return 9;
- case R10: case D10: return 10;
- case R11: case D11: return 11;
- case R12: case D12: return 12;
- case SP: case D13: return 13;
- case LR: case D14: return 14;
- case PC: case D15: return 15;
-
- case S0: case S1: case S2: case S3:
- case S4: case S5: case S6: case S7:
- case S8: case S9: case S10: case S11:
- case S12: case S13: case S14: case S15:
- case S16: case S17: case S18: case S19:
- case S20: case S21: case S22: case S23:
- case S24: case S25: case S26: case S27:
- case S28: case S29: case S30: case S31: {
- isSPVFP = true;
- switch (RegEnum) {
- default: return 0; // Avoid compile time warning.
- case S0: return 0;
- case S1: return 1;
- case S2: return 2;
- case S3: return 3;
- case S4: return 4;
- case S5: return 5;
- case S6: return 6;
- case S7: return 7;
- case S8: return 8;
- case S9: return 9;
- case S10: return 10;
- case S11: return 11;
- case S12: return 12;
- case S13: return 13;
- case S14: return 14;
- case S15: return 15;
- case S16: return 16;
- case S17: return 17;
- case S18: return 18;
- case S19: return 19;
- case S20: return 20;
- case S21: return 21;
- case S22: return 22;
- case S23: return 23;
- case S24: return 24;
- case S25: return 25;
- case S26: return 26;
- case S27: return 27;
- case S28: return 28;
- case S29: return 29;
- case S30: return 30;
- case S31: return 31;
- }
- }
- }
-}
-
-ARMBaseRegisterInfo::ARMBaseRegisterInfo(const TargetInstrInfo &tii,
- const ARMSubtarget &sti)
- : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
- TII(tii), STI(sti),
- FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11) {
-}
-
-ARMRegisterInfo::ARMRegisterInfo(const TargetInstrInfo &tii,
+ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii,
const ARMSubtarget &sti)
: ARMBaseRegisterInfo(tii, sti) {
}
-
-static inline
-const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
- return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
-}
-
-static inline
-const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
- return MIB.addReg(0);
-}
-
-/// emitLoadConstPool - Emits a load from constpool to materialize the
-/// specified immediate.
-void ARMRegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo *TII, DebugLoc dl,
- unsigned DestReg, int Val,
- ARMCC::CondCodes Pred,
- unsigned PredReg) const {
- MachineFunction &MF = *MBB.getParent();
- MachineConstantPool *ConstantPool = MF.getConstantPool();
- Constant *C = ConstantInt::get(Type::Int32Ty, Val);
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
-
- BuildMI(MBB, MBBI, dl, TII->get(ARM::LDRcp), DestReg)
- .addConstantPoolIndex(Idx)
- .addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
-}
-
-const unsigned*
-ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- static const unsigned CalleeSavedRegs[] = {
- ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8,
- ARM::R7, ARM::R6, ARM::R5, ARM::R4,
-
- ARM::D15, ARM::D14, ARM::D13, ARM::D12,
- ARM::D11, ARM::D10, ARM::D9, ARM::D8,
- 0
- };
-
- static const unsigned DarwinCalleeSavedRegs[] = {
- // Darwin ABI deviates from ARM standard ABI. R9 is not a callee-saved
- // register.
- ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4,
- ARM::R11, ARM::R10, ARM::R8,
-
- ARM::D15, ARM::D14, ARM::D13, ARM::D12,
- ARM::D11, ARM::D10, ARM::D9, ARM::D8,
- 0
- };
- return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs;
-}
-
-const TargetRegisterClass* const *
-ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
- static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
-
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- 0
- };
-
- static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = {
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass,
- &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass,
-
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- 0
- };
-
- static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = {
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass,
-
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- 0
- };
-
- static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={
- &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass,
- &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass,
-
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- 0
- };
-
- if (STI.isThumb()) {
- return STI.isTargetDarwin()
- ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses;
- }
- return STI.isTargetDarwin()
- ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses;
-}
-
-BitVector ARMBaseRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
- // FIXME: avoid re-calculating this every time.
- BitVector Reserved(getNumRegs());
- Reserved.set(ARM::SP);
- Reserved.set(ARM::PC);
- if (STI.isTargetDarwin() || hasFP(MF))
- Reserved.set(FramePtr);
- // Some targets reserve R9.
- if (STI.isR9Reserved())
- Reserved.set(ARM::R9);
- return Reserved;
-}
-
-bool
-ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF, unsigned Reg) const {
- switch (Reg) {
- default: break;
- case ARM::SP:
- case ARM::PC:
- return true;
- case ARM::R7:
- case ARM::R11:
- if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF)))
- return true;
- break;
- case ARM::R9:
- return STI.isR9Reserved();
- }
-
- return false;
-}
-
-const TargetRegisterClass *ARMBaseRegisterInfo::getPointerRegClass() const {
- return &ARM::GPRRegClass;
-}
-
-/// getAllocationOrder - Returns the register allocation order for a specified
-/// register class in the form of a pair of TargetRegisterClass iterators.
-std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
-ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC,
- unsigned HintType, unsigned HintReg,
- const MachineFunction &MF) const {
- // Alternative register allocation orders when favoring even / odd registers
- // of register pairs.
-
- // No FP, R9 is available.
- static const unsigned GPREven1[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, ARM::R10,
- ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7,
- ARM::R9, ARM::R11
- };
- static const unsigned GPROdd1[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R9, ARM::R11,
- ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6,
- ARM::R8, ARM::R10
- };
-
- // FP is R7, R9 is available.
- static const unsigned GPREven2[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R8, ARM::R10,
- ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6,
- ARM::R9, ARM::R11
- };
- static const unsigned GPROdd2[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R9, ARM::R11,
- ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6,
- ARM::R8, ARM::R10
- };
-
- // FP is R11, R9 is available.
- static const unsigned GPREven3[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8,
- ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7,
- ARM::R9
- };
- static const unsigned GPROdd3[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R6, ARM::R9,
- ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R7,
- ARM::R8
- };
-
- // No FP, R9 is not available.
- static const unsigned GPREven4[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R10,
- ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8,
- ARM::R11
- };
- static const unsigned GPROdd4[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R11,
- ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8,
- ARM::R10
- };
-
- // FP is R7, R9 is not available.
- static const unsigned GPREven5[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R10,
- ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, ARM::R8,
- ARM::R11
- };
- static const unsigned GPROdd5[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R11,
- ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8,
- ARM::R10
- };
-
- // FP is R11, R9 is not available.
- static const unsigned GPREven6[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R6,
- ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8
- };
- static const unsigned GPROdd6[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R7,
- ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8
- };
-
-
- if (HintType == ARMRI::RegPairEven) {
- if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0)
- // It's no longer possible to fulfill this hint. Return the default
- // allocation order.
- return std::make_pair(RC->allocation_order_begin(MF),
- RC->allocation_order_end(MF));
-
- if (!STI.isTargetDarwin() && !hasFP(MF)) {
- if (!STI.isR9Reserved())
- return std::make_pair(GPREven1,
- GPREven1 + (sizeof(GPREven1)/sizeof(unsigned)));
- else
- return std::make_pair(GPREven4,
- GPREven4 + (sizeof(GPREven4)/sizeof(unsigned)));
- } else if (FramePtr == ARM::R7) {
- if (!STI.isR9Reserved())
- return std::make_pair(GPREven2,
- GPREven2 + (sizeof(GPREven2)/sizeof(unsigned)));
- else
- return std::make_pair(GPREven5,
- GPREven5 + (sizeof(GPREven5)/sizeof(unsigned)));
- } else { // FramePtr == ARM::R11
- if (!STI.isR9Reserved())
- return std::make_pair(GPREven3,
- GPREven3 + (sizeof(GPREven3)/sizeof(unsigned)));
- else
- return std::make_pair(GPREven6,
- GPREven6 + (sizeof(GPREven6)/sizeof(unsigned)));
- }
- } else if (HintType == ARMRI::RegPairOdd) {
- if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0)
- // It's no longer possible to fulfill this hint. Return the default
- // allocation order.
- return std::make_pair(RC->allocation_order_begin(MF),
- RC->allocation_order_end(MF));
-
- if (!STI.isTargetDarwin() && !hasFP(MF)) {
- if (!STI.isR9Reserved())
- return std::make_pair(GPROdd1,
- GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned)));
- else
- return std::make_pair(GPROdd4,
- GPROdd4 + (sizeof(GPROdd4)/sizeof(unsigned)));
- } else if (FramePtr == ARM::R7) {
- if (!STI.isR9Reserved())
- return std::make_pair(GPROdd2,
- GPROdd2 + (sizeof(GPROdd2)/sizeof(unsigned)));
- else
- return std::make_pair(GPROdd5,
- GPROdd5 + (sizeof(GPROdd5)/sizeof(unsigned)));
- } else { // FramePtr == ARM::R11
- if (!STI.isR9Reserved())
- return std::make_pair(GPROdd3,
- GPROdd3 + (sizeof(GPROdd3)/sizeof(unsigned)));
- else
- return std::make_pair(GPROdd6,
- GPROdd6 + (sizeof(GPROdd6)/sizeof(unsigned)));
- }
- }
- return std::make_pair(RC->allocation_order_begin(MF),
- RC->allocation_order_end(MF));
-}
-
-/// ResolveRegAllocHint - Resolves the specified register allocation hint
-/// to a physical register. Returns the physical register if it is successful.
-unsigned
-ARMBaseRegisterInfo::ResolveRegAllocHint(unsigned Type, unsigned Reg,
- const MachineFunction &MF) const {
- if (Reg == 0 || !isPhysicalRegister(Reg))
- return 0;
- if (Type == 0)
- return Reg;
- else if (Type == (unsigned)ARMRI::RegPairOdd)
- // Odd register.
- return getRegisterPairOdd(Reg, MF);
- else if (Type == (unsigned)ARMRI::RegPairEven)
- // Even register.
- return getRegisterPairEven(Reg, MF);
- return 0;
-}
-
-void
-ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
- MachineFunction &MF) const {
- MachineRegisterInfo *MRI = &MF.getRegInfo();
- std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
- if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
- Hint.first == (unsigned)ARMRI::RegPairEven) &&
- Hint.second && TargetRegisterInfo::isVirtualRegister(Hint.second)) {
-    // If 'Reg' is one register of an even / odd pair and it has now been
-    // changed (e.g. coalesced) into a different register, the allocation
-    // hint of the other register of the pair must be updated to reflect
-    // the change.
- unsigned OtherReg = Hint.second;
- Hint = MRI->getRegAllocationHint(OtherReg);
- if (Hint.second == Reg)
-      // Make sure the pair has not already been split up.
- MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
- }
-}
-
-bool
-ARMRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
- return true;
-}
-
-/// hasFP - Return true if the specified function should have a dedicated frame
-/// pointer register. This is true if the function has variable sized allocas
-/// or if frame pointer elimination is disabled.
-///
-bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- return (NoFramePointerElim ||
- MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken());
-}
-
-// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
-// not required, we reserve argument space for call sites in the function
-// immediately on entry to the current function. This eliminates the need for
-// add/sub sp brackets around call sites. Returns true if the call frame is
-// included as part of the stack frame.
-bool ARMRegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
- const MachineFrameInfo *FFI = MF.getFrameInfo();
- unsigned CFSize = FFI->getMaxCallFrameSize();
- // It's not always a good idea to include the call frame as part of the
-  // stack frame. ARM (especially Thumb) has only small immediate offsets
-  // for addressing the stack frame, so a large call frame can cause poor
-  // codegen and may even make it impossible to scavenge a register.
- if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12
- return false;
-
- return !MF.getFrameInfo()->hasVarSizedObjects();
-}
-
-/// emitARMRegPlusImmediate - Emits a series of instructions to materialize
-/// a destreg = basereg + immediate in ARM code.
-static
-void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
- const TargetInstrInfo &TII,
- DebugLoc dl) {
- bool isSub = NumBytes < 0;
- if (isSub) NumBytes = -NumBytes;
-
- while (NumBytes) {
- unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
- unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
- assert(ThisVal && "Didn't extract field correctly");
-
-    // We will handle these bits of the offset here, so clear them.
- NumBytes &= ~ThisVal;
-
- // Get the properly encoded SOImmVal field.
- int SOImmVal = ARM_AM::getSOImmVal(ThisVal);
- assert(SOImmVal != -1 && "Bit extraction didn't work?");
-
- // Build the new ADD / SUB.
- BuildMI(MBB, MBBI, dl, TII.get(isSub ? ARM::SUBri : ARM::ADDri), DestReg)
- .addReg(BaseReg, RegState::Kill).addImm(SOImmVal)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
- BaseReg = DestReg;
- }
-}
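
The loop above works because every ARM data-processing immediate is an 8-bit value rotated right by an even amount; an arbitrary offset is therefore peeled into encodable chunks, one ADDri/SUBri per chunk. A minimal standalone sketch of that peeling, using local stand-ins (rotr32, lowestEvenBit) rather than the real ARM_AM helpers:

    #include <cstdint>
    #include <cstdio>

    // Rotate right by Amt (0..31); the ternary avoids the V << 32 UB case.
    static uint32_t rotr32(uint32_t V, unsigned Amt) {
      Amt &= 31;
      return Amt ? (V >> Amt) | (V << (32 - Amt)) : V;
    }

    // Index of the lowest set bit, rounded down to an even position.
    static unsigned lowestEvenBit(uint32_t V) {
      unsigned N = 0;
      while (!(V & 1)) { V >>= 1; ++N; }
      return N & ~1u;
    }

    int main() {
      uint32_t Offset = 0x12345678; // arbitrary frame offset
      unsigned NumInsts = 0;
      while (Offset) {
        unsigned Lo = lowestEvenBit(Offset);
        // 8-bit window at an even bit position (may wrap past bit 31),
        // i.e. exactly what one ADDri / SUBri immediate can encode.
        uint32_t Chunk = Offset & rotr32(0xFFu, (32 - Lo) & 31);
        std::printf("add/sub chunk: 0x%08x\n", Chunk);
        Offset &= ~Chunk;
        ++NumInsts;
      }
      std::printf("%u instructions\n", NumInsts);
    }

For 0x12345678 this emits four chunks (0x278, 0x5400, 0x2340000, 0x10000000), mirroring the sequence the emitARMRegPlusImmediate loop above would generate.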
-
-static void
-emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo &TII, DebugLoc dl,
- int NumBytes,
- ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
- emitARMRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes,
- Pred, PredReg, TII, dl);
-}
-
-void ARMRegisterInfo::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
- if (!hasReservedCallFrame(MF)) {
- // If we have alloca, convert as follows:
-    // ADJCALLSTACKDOWN -> sub sp, sp, amount
-    // ADJCALLSTACKUP   -> add sp, sp, amount
- MachineInstr *Old = I;
- DebugLoc dl = Old->getDebugLoc();
- unsigned Amount = Old->getOperand(0).getImm();
- if (Amount != 0) {
- // We need to keep the stack aligned properly. To do this, we round the
- // amount of space needed for the outgoing arguments up to the next
- // alignment boundary.
- unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
- Amount = (Amount+Align-1)/Align*Align;
-
- // Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old->getOpcode();
- ARMCC::CondCodes Pred = (ARMCC::CondCodes)Old->getOperand(1).getImm();
- if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
- // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
- unsigned PredReg = Old->getOperand(2).getReg();
- emitSPUpdate(MBB, I, TII, dl, -Amount, Pred, PredReg);
- } else {
- // Note: PredReg is operand 3 for ADJCALLSTACKUP.
- unsigned PredReg = Old->getOperand(3).getReg();
- assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
- emitSPUpdate(MBB, I, TII, dl, Amount, Pred, PredReg);
- }
- }
- }
- MBB.erase(I);
-}
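
The Amount rounding above is the standard align-up idiom for keeping SP at the target stack alignment. A quick worked check (alignTo here is a hypothetical local helper, not an LLVM API):

    #include <cassert>

    // Round N up to the next multiple of Align (Align > 0).
    static unsigned alignTo(unsigned N, unsigned Align) {
      return (N + Align - 1) / Align * Align;
    }

    int main() {
      assert(alignTo(13, 8) == 16); // outgoing-arg space padded to alignment
      assert(alignTo(16, 8) == 16); // already aligned: unchanged
      assert(alignTo(0, 8) == 0);
    }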
-
-/// findScratchRegister - Find a 'free' ARM register. If register scavenger
-/// is not being used, R12 is available. Otherwise, try for a call-clobbered
-/// register first and then a spilled callee-saved register if that fails.
-static
-unsigned findScratchRegister(RegScavenger *RS, const TargetRegisterClass *RC,
- ARMFunctionInfo *AFI) {
- unsigned Reg = RS ? RS->FindUnusedReg(RC, true) : (unsigned) ARM::R12;
- assert (!AFI->isThumbFunction());
- if (Reg == 0)
-    // Try an already-spilled CS register.
- Reg = RS->FindUnusedReg(RC, AFI->getSpilledCSRegisters());
-
- return Reg;
-}
-
-void ARMRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const{
- unsigned i = 0;
- MachineInstr &MI = *II;
- MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction &MF = *MBB.getParent();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- DebugLoc dl = MI.getDebugLoc();
-
- while (!MI.getOperand(i).isFI()) {
- ++i;
- assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
- }
-
- unsigned FrameReg = ARM::SP;
- int FrameIndex = MI.getOperand(i).getIndex();
- int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
- MF.getFrameInfo()->getStackSize() + SPAdj;
-
- if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
- Offset -= AFI->getGPRCalleeSavedArea1Offset();
- else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
- Offset -= AFI->getGPRCalleeSavedArea2Offset();
- else if (AFI->isDPRCalleeSavedAreaFrame(FrameIndex))
- Offset -= AFI->getDPRCalleeSavedAreaOffset();
- else if (hasFP(MF)) {
- assert(SPAdj == 0 && "Unexpected");
-    // There are alloca()'s in this function; we must reference off the frame
-    // pointer instead.
- FrameReg = getFrameRegister(MF);
- Offset -= AFI->getFramePtrSpillOffset();
- }
-
- unsigned Opcode = MI.getOpcode();
- const TargetInstrDesc &Desc = MI.getDesc();
- unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
- bool isSub = false;
-
- // Memory operands in inline assembly always use AddrMode2.
- if (Opcode == ARM::INLINEASM)
- AddrMode = ARMII::AddrMode2;
-
- if (Opcode == ARM::ADDri) {
- Offset += MI.getOperand(i+1).getImm();
- if (Offset == 0) {
- // Turn it into a move.
- MI.setDesc(TII.get(ARM::MOVr));
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.RemoveOperand(i+1);
- return;
- } else if (Offset < 0) {
- Offset = -Offset;
- isSub = true;
- MI.setDesc(TII.get(ARM::SUBri));
- }
-
- // Common case: small offset, fits into instruction.
- int ImmedOffset = ARM_AM::getSOImmVal(Offset);
- if (ImmedOffset != -1) {
- // Replace the FrameIndex with sp / fp
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(ImmedOffset);
- return;
- }
-
-    // Otherwise, we fall back to the common code below to form the immediate
-    // offset with a sequence of ADDri instructions. First, though, pull as
-    // much of the immediate into this ADDri as possible.
- unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
- unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
-
-    // We will handle these bits of the offset here, so clear them.
- Offset &= ~ThisImmVal;
-
- // Get the properly encoded SOImmVal field.
- int ThisSOImmVal = ARM_AM::getSOImmVal(ThisImmVal);
- assert(ThisSOImmVal != -1 && "Bit extraction didn't work?");
- MI.getOperand(i+1).ChangeToImmediate(ThisSOImmVal);
- } else {
- unsigned ImmIdx = 0;
- int InstrOffs = 0;
- unsigned NumBits = 0;
- unsigned Scale = 1;
- switch (AddrMode) {
- case ARMII::AddrMode2: {
- ImmIdx = i+2;
- InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
- if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
- InstrOffs *= -1;
- NumBits = 12;
- break;
- }
- case ARMII::AddrMode3: {
- ImmIdx = i+2;
- InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
- if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
- InstrOffs *= -1;
- NumBits = 8;
- break;
- }
- case ARMII::AddrMode5: {
- ImmIdx = i+1;
- InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
- if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
- InstrOffs *= -1;
- NumBits = 8;
- Scale = 4;
- break;
- }
- default:
- assert(0 && "Unsupported addressing mode!");
- abort();
- break;
- }
-
- Offset += InstrOffs * Scale;
- assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
- if (Offset < 0) {
- Offset = -Offset;
- isSub = true;
- }
-
- // Common case: small offset, fits into instruction.
- MachineOperand &ImmOp = MI.getOperand(ImmIdx);
- int ImmedOffset = Offset / Scale;
- unsigned Mask = (1 << NumBits) - 1;
- if ((unsigned)Offset <= Mask * Scale) {
- // Replace the FrameIndex with sp
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- if (isSub)
- ImmedOffset |= 1 << NumBits;
- ImmOp.ChangeToImmediate(ImmedOffset);
- return;
- }
-
- // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
- ImmedOffset = ImmedOffset & Mask;
- if (isSub)
- ImmedOffset |= 1 << NumBits;
- ImmOp.ChangeToImmediate(ImmedOffset);
- Offset &= ~(Mask*Scale);
- }
-
-  // If we get here, the immediate doesn't fit into the instruction. We folded
-  // as much as possible above; handle the rest by providing a register that
-  // is SP+LargeImm.
- assert(Offset && "This code isn't needed if offset already handled!");
-
- // Insert a set of r12 with the full address: r12 = sp + offset
- // If the offset we have is too large to fit into the instruction, we need
- // to form it with a series of ADDri's. Do this by taking 8-bit chunks
- // out of 'Offset'.
- unsigned ScratchReg = findScratchRegister(RS, &ARM::GPRRegClass, AFI);
- if (ScratchReg == 0)
- // No register is "free". Scavenge a register.
- ScratchReg = RS->scavengeRegister(&ARM::GPRRegClass, II, SPAdj);
- int PIdx = MI.findFirstPredOperandIdx();
- ARMCC::CondCodes Pred = (PIdx == -1)
- ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
- unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
- emitARMRegPlusImmediate(MBB, II, ScratchReg, FrameReg,
- isSub ? -Offset : Offset, Pred, PredReg, TII, dl);
- MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
-}
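
The addressing-mode cases above all store the offset as sign-magnitude: an unsigned, possibly scaled value in NumBits bits plus an add/sub flag at bit NumBits. A small sketch of that packing under the same (NumBits, Scale) pairs as the switch; encodeOffset is illustrative, not the real ARM_AM encoding API:

    #include <cassert>
    #include <cstdlib>

    // Encode a byte offset as sign-magnitude with an add/sub bit at NumBits.
    // Returns false if the magnitude does not fit (caller must split it).
    static bool encodeOffset(int Offset, unsigned NumBits, unsigned Scale,
                             unsigned &Encoded) {
      bool isSub = Offset < 0;
      unsigned Mag = std::abs(Offset) / Scale;
      if (std::abs(Offset) % Scale || Mag > (1u << NumBits) - 1)
        return false;
      Encoded = Mag | (isSub ? 1u << NumBits : 0);
      return true;
    }

    int main() {
      unsigned E;
      assert(encodeOffset(-40, 8, 4, E) && E == (10u | (1u << 8))); // AddrMode5
      assert(encodeOffset(4095, 12, 1, E) && E == 4095u);           // AddrMode2
      assert(!encodeOffset(1024, 8, 4, E)); // 1024/4 = 256 > 255: no fit
    }

An offset that fails this test is exactly the case the fall-through code above handles by materializing FrameReg+Offset into a scratch register.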
-
-static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) {
- const MachineFrameInfo *FFI = MF.getFrameInfo();
- int Offset = 0;
- for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
- int FixedOff = -FFI->getObjectOffset(i);
- if (FixedOff > Offset) Offset = FixedOff;
- }
- for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
- if (FFI->isDeadObjectIndex(i))
- continue;
- Offset += FFI->getObjectSize(i);
- unsigned Align = FFI->getObjectAlignment(i);
- // Adjust to alignment boundary
- Offset = (Offset+Align-1)/Align*Align;
- }
- return (unsigned)Offset;
-}
-
-void
-ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
-  // This tells PEI to spill the FP as if it were any other callee-saved
-  // register, to take advantage of the eliminateFrameIndex machinery. It also
-  // ensures the FP is spilled in the order specified by getCalleeSavedRegs(),
-  // which makes it easier to combine multiple loads / stores.
- bool CanEliminateFrame = true;
- bool CS1Spilled = false;
- bool LRSpilled = false;
- unsigned NumGPRSpills = 0;
- SmallVector<unsigned, 4> UnspilledCS1GPRs;
- SmallVector<unsigned, 4> UnspilledCS2GPRs;
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-
- // Don't spill FP if the frame can be eliminated. This is determined
- // by scanning the callee-save registers to see if any is used.
- const unsigned *CSRegs = getCalleeSavedRegs();
- const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses();
- for (unsigned i = 0; CSRegs[i]; ++i) {
- unsigned Reg = CSRegs[i];
- bool Spilled = false;
- if (MF.getRegInfo().isPhysRegUsed(Reg)) {
- AFI->setCSRegisterIsSpilled(Reg);
- Spilled = true;
- CanEliminateFrame = false;
- } else {
- // Check alias registers too.
- for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) {
- if (MF.getRegInfo().isPhysRegUsed(*Aliases)) {
- Spilled = true;
- CanEliminateFrame = false;
- }
- }
- }
-
- if (CSRegClasses[i] == &ARM::GPRRegClass) {
- if (Spilled) {
- NumGPRSpills++;
-
- if (!STI.isTargetDarwin()) {
- if (Reg == ARM::LR)
- LRSpilled = true;
- CS1Spilled = true;
- continue;
- }
-
-      // Keep track of whether LR and any of R4, R5, R6, and R7 are spilled.
- switch (Reg) {
- case ARM::LR:
- LRSpilled = true;
- // Fallthrough
- case ARM::R4:
- case ARM::R5:
- case ARM::R6:
- case ARM::R7:
- CS1Spilled = true;
- break;
- default:
- break;
- }
- } else {
- if (!STI.isTargetDarwin()) {
- UnspilledCS1GPRs.push_back(Reg);
- continue;
- }
-
- switch (Reg) {
- case ARM::R4:
- case ARM::R5:
- case ARM::R6:
- case ARM::R7:
- case ARM::LR:
- UnspilledCS1GPRs.push_back(Reg);
- break;
- default:
- UnspilledCS2GPRs.push_back(Reg);
- break;
- }
- }
- }
- }
-
- bool ForceLRSpill = false;
- if (!LRSpilled && AFI->isThumbFunction()) {
- unsigned FnSize = TII.GetFunctionSizeInBytes(MF);
-    // Force LR to be spilled if the Thumb function size is > 2048. This
-    // enables the use of BL to implement a far jump. If it turns out that it's
-    // not needed, the branch fixup path will undo it.
- if (FnSize >= (1 << 11)) {
- CanEliminateFrame = false;
- ForceLRSpill = true;
- }
- }
-
- bool ExtraCSSpill = false;
- if (!CanEliminateFrame || hasFP(MF)) {
- AFI->setHasStackFrame(true);
-
-    // If LR is not spilled but at least one of R4, R5, R6, and R7 is, spill
-    // LR as well so we can fold BX_RET into the register restore (LDM).
- if (!LRSpilled && CS1Spilled) {
- MF.getRegInfo().setPhysRegUsed(ARM::LR);
- AFI->setCSRegisterIsSpilled(ARM::LR);
- NumGPRSpills++;
- UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
- UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
- ForceLRSpill = false;
- ExtraCSSpill = true;
- }
-
- // Darwin ABI requires FP to point to the stack slot that contains the
- // previous FP.
- if (STI.isTargetDarwin() || hasFP(MF)) {
- MF.getRegInfo().setPhysRegUsed(FramePtr);
- NumGPRSpills++;
- }
-
-    // If the stack and doubles are 8-byte aligned and we are spilling an odd
-    // number of GPRs, spill one extra callee-save GPR so we won't have to pad
-    // between the integer and double callee-save areas.
- unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
- if (TargetAlign == 8 && (NumGPRSpills & 1)) {
- if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
- for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
- unsigned Reg = UnspilledCS1GPRs[i];
-          // Don't spill a high register if the function is Thumb.
- if (!AFI->isThumbFunction() ||
- isARMLowRegister(Reg) || Reg == ARM::LR) {
- MF.getRegInfo().setPhysRegUsed(Reg);
- AFI->setCSRegisterIsSpilled(Reg);
- if (!isReservedReg(MF, Reg))
- ExtraCSSpill = true;
- break;
- }
- }
- } else if (!UnspilledCS2GPRs.empty() &&
- !AFI->isThumbFunction()) {
- unsigned Reg = UnspilledCS2GPRs.front();
- MF.getRegInfo().setPhysRegUsed(Reg);
- AFI->setCSRegisterIsSpilled(Reg);
- if (!isReservedReg(MF, Reg))
- ExtraCSSpill = true;
- }
- }
-
- // Estimate if we might need to scavenge a register at some point in order
- // to materialize a stack offset. If so, either spill one additional
- // callee-saved register or reserve a special spill slot to facilitate
- // register scavenging.
- if (RS && !ExtraCSSpill && !AFI->isThumbFunction()) {
- MachineFrameInfo *MFI = MF.getFrameInfo();
- unsigned Size = estimateStackSize(MF, MFI);
- unsigned Limit = (1 << 12) - 1;
- for (MachineFunction::iterator BB = MF.begin(),E = MF.end();BB != E; ++BB)
- for (MachineBasicBlock::iterator I= BB->begin(); I != BB->end(); ++I) {
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
- if (I->getOperand(i).isFI()) {
- unsigned Opcode = I->getOpcode();
- const TargetInstrDesc &Desc = TII.get(Opcode);
- unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
- if (AddrMode == ARMII::AddrMode3) {
- Limit = (1 << 8) - 1;
- goto DoneEstimating;
- } else if (AddrMode == ARMII::AddrMode5) {
- unsigned ThisLimit = ((1 << 8) - 1) * 4;
- if (ThisLimit < Limit)
- Limit = ThisLimit;
- }
- }
- }
- DoneEstimating:
- if (Size >= Limit) {
- // If any non-reserved CS register isn't spilled, just spill one or two
- // extra. That should take care of it!
- unsigned NumExtras = TargetAlign / 4;
- SmallVector<unsigned, 2> Extras;
- while (NumExtras && !UnspilledCS1GPRs.empty()) {
- unsigned Reg = UnspilledCS1GPRs.back();
- UnspilledCS1GPRs.pop_back();
- if (!isReservedReg(MF, Reg)) {
- Extras.push_back(Reg);
- NumExtras--;
- }
- }
- while (NumExtras && !UnspilledCS2GPRs.empty()) {
- unsigned Reg = UnspilledCS2GPRs.back();
- UnspilledCS2GPRs.pop_back();
- if (!isReservedReg(MF, Reg)) {
- Extras.push_back(Reg);
- NumExtras--;
- }
- }
- if (Extras.size() && NumExtras == 0) {
- for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
- MF.getRegInfo().setPhysRegUsed(Extras[i]);
- AFI->setCSRegisterIsSpilled(Extras[i]);
- }
- } else {
- // Reserve a slot closest to SP or frame pointer.
- const TargetRegisterClass *RC = &ARM::GPRRegClass;
- RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
- RC->getAlignment()));
- }
- }
- }
- }
-
- if (ForceLRSpill) {
- MF.getRegInfo().setPhysRegUsed(ARM::LR);
- AFI->setCSRegisterIsSpilled(ARM::LR);
- AFI->setLRIsSpilledForFarJump(true);
- }
-}
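
The estimate above works by shrinking the reachable-offset limit to the most restrictive addressing mode that actually references a frame index. A compact sketch of the limit computation; the set of modes seen is an assumption for illustration:

    #include <algorithm>
    #include <cstdio>

    enum AddrMode { AddrMode2, AddrMode3, AddrMode5 };

    // Largest positive immediate offset encodable by each mode, in bytes.
    static unsigned offsetLimit(AddrMode AM) {
      switch (AM) {
      case AddrMode2: return (1 << 12) - 1;      // 4095: 12-bit byte offset
      case AddrMode3: return (1 << 8) - 1;       // 255:  8-bit byte offset
      case AddrMode5: return ((1 << 8) - 1) * 4; // 1020: 8-bit word offset
      }
      return 0;
    }

    int main() {
      // Same shrinking logic as the scan above: start at the imm12 limit and
      // clamp by every frame-index-using mode seen in the function.
      AddrMode Seen[] = { AddrMode2, AddrMode5 };
      unsigned Limit = (1 << 12) - 1;
      for (AddrMode AM : Seen)
        Limit = std::min(Limit, offsetLimit(AM));
      std::printf("limit = %u bytes\n", Limit); // 1020: scavenge if exceeded
    }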
-
-/// Move the iterator past the next group of callee-save load / store ops for
-/// the particular spill area (1: integer area 1, 2: integer area 2,
-/// 3: fp area, 0: don't care).
-static void movePastCSLoadStoreOps(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- int Opc, unsigned Area,
- const ARMSubtarget &STI) {
- while (MBBI != MBB.end() &&
- MBBI->getOpcode() == Opc && MBBI->getOperand(1).isFI()) {
- if (Area != 0) {
- bool Done = false;
- unsigned Category = 0;
- switch (MBBI->getOperand(0).getReg()) {
- case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7:
- case ARM::LR:
- Category = 1;
- break;
- case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11:
- Category = STI.isTargetDarwin() ? 2 : 1;
- break;
- case ARM::D8: case ARM::D9: case ARM::D10: case ARM::D11:
- case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15:
- Category = 3;
- break;
- default:
- Done = true;
- break;
- }
- if (Done || Category != Area)
- break;
- }
-
- ++MBBI;
- }
-}
-
-void ARMRegisterInfo::emitPrologue(MachineFunction &MF) const {
- MachineBasicBlock &MBB = MF.front();
- MachineBasicBlock::iterator MBBI = MBB.begin();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
- unsigned NumBytes = MFI->getStackSize();
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- DebugLoc dl = (MBBI != MBB.end() ?
- MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
-
-  // Determine the size of each callee-save spill area and record which frame
-  // index belongs to which area.
- unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
- int FramePtrSpillFI = 0;
-
- if (VARegSaveSize)
- emitSPUpdate(MBB, MBBI, TII, dl, -VARegSaveSize);
-
- if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, -NumBytes);
- return;
- }
-
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
- int FI = CSI[i].getFrameIdx();
- switch (Reg) {
- case ARM::R4:
- case ARM::R5:
- case ARM::R6:
- case ARM::R7:
- case ARM::LR:
- if (Reg == FramePtr)
- FramePtrSpillFI = FI;
- AFI->addGPRCalleeSavedArea1Frame(FI);
- GPRCS1Size += 4;
- break;
- case ARM::R8:
- case ARM::R9:
- case ARM::R10:
- case ARM::R11:
- if (Reg == FramePtr)
- FramePtrSpillFI = FI;
- if (STI.isTargetDarwin()) {
- AFI->addGPRCalleeSavedArea2Frame(FI);
- GPRCS2Size += 4;
- } else {
- AFI->addGPRCalleeSavedArea1Frame(FI);
- GPRCS1Size += 4;
- }
- break;
- default:
- AFI->addDPRCalleeSavedAreaFrame(FI);
- DPRCSSize += 8;
- }
- }
-
- // Build the new SUBri to adjust SP for integer callee-save spill area 1.
- emitSPUpdate(MBB, MBBI, TII, dl, -GPRCS1Size);
- movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 1, STI);
-
- // Darwin ABI requires FP to point to the stack slot that contains the
- // previous FP.
- if (STI.isTargetDarwin() || hasFP(MF)) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(ARM::ADDri), FramePtr)
- .addFrameIndex(FramePtrSpillFI).addImm(0);
- AddDefaultCC(AddDefaultPred(MIB));
- }
-
- // Build the new SUBri to adjust SP for integer callee-save spill area 2.
- emitSPUpdate(MBB, MBBI, TII, dl, -GPRCS2Size);
-
- // Build the new SUBri to adjust SP for FP callee-save spill area.
- movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 2, STI);
- emitSPUpdate(MBB, MBBI, TII, dl, -DPRCSSize);
-
- // Determine starting offsets of spill areas.
- unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
- unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
- unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
- AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
- AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
- AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
- AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
-
- NumBytes = DPRCSOffset;
- if (NumBytes) {
- // Insert it after all the callee-save spills.
- movePastCSLoadStoreOps(MBB, MBBI, ARM::FSTD, 3, STI);
- emitSPUpdate(MBB, MBBI, TII, dl, -NumBytes);
- }
-
- if (STI.isTargetELF() && hasFP(MF)) {
- MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
- AFI->getFramePtrSpillOffset());
- }
-
- AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
- AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
- AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
-}
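
The offset bookkeeping at the end of the prologue is easiest to follow with concrete numbers. A small sketch, assuming an illustrative 64-byte frame with 20/12/16-byte spill areas (the sizes are made up; the arithmetic is the same):

    #include <cstdio>

    int main() {
      // Illustrative sizes: r4-r7+lr in area 1, r8/r10/r11 in area 2
      // (Darwin), two D registers in the FP area, on a 64-byte frame.
      unsigned NumBytes = 64, GPRCS1Size = 20, GPRCS2Size = 12, DPRCSSize = 16;

      // Areas are carved off the top of the frame, as in the prologue above.
      unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
      unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
      unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;

      std::printf("GPRCS1 at %u, GPRCS2 at %u, DPRCS at %u, locals below %u\n",
                  GPRCS1Offset, GPRCS2Offset, DPRCSOffset, DPRCSOffset);
      // GPRCS1 at 44, GPRCS2 at 32, DPRCS at 16, locals below 16
    }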
-
-static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
- for (unsigned i = 0; CSRegs[i]; ++i)
- if (Reg == CSRegs[i])
- return true;
- return false;
-}
-
-static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) {
- return ((MI->getOpcode() == ARM::FLDD ||
- MI->getOpcode() == ARM::LDR) &&
- MI->getOperand(1).isFI() &&
- isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
-}
-
-void ARMRegisterInfo::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = prior(MBB.end());
- assert(MBBI->getOpcode() == ARM::BX_RET &&
- "Can only insert epilog into returning blocks");
- DebugLoc dl = MBBI->getDebugLoc();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
- int NumBytes = (int)MFI->getStackSize();
-
- if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, NumBytes);
- } else {
- // Unwind MBBI to point to first LDR / FLDD.
- const unsigned *CSRegs = getCalleeSavedRegs();
- if (MBBI != MBB.begin()) {
- do
- --MBBI;
- while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
- if (!isCSRestore(MBBI, CSRegs))
- ++MBBI;
- }
-
- // Move SP to start of FP callee save spill area.
- NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
- AFI->getGPRCalleeSavedArea2Size() +
- AFI->getDPRCalleeSavedAreaSize());
-
- // Darwin ABI requires FP to point to the stack slot that contains the
- // previous FP.
- if ((STI.isTargetDarwin() && NumBytes) || hasFP(MF)) {
- NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
-      // Reset SP based on the frame pointer only if the stack frame extends
-      // beyond the frame pointer's stack slot, or the target is ELF and the
-      // function has an FP.
- if (AFI->getGPRCalleeSavedArea2Size() ||
- AFI->getDPRCalleeSavedAreaSize() ||
- AFI->getDPRCalleeSavedAreaOffset()||
- hasFP(MF)) {
- if (NumBytes)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::SUBri), ARM::SP).addReg(FramePtr)
- .addImm(NumBytes)
- .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
- else
- BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP).addReg(FramePtr)
- .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
- }
- } else if (NumBytes) {
- emitSPUpdate(MBB, MBBI, TII, dl, NumBytes);
- }
-
- // Move SP to start of integer callee save spill area 2.
- movePastCSLoadStoreOps(MBB, MBBI, ARM::FLDD, 3, STI);
- emitSPUpdate(MBB, MBBI, TII, dl, AFI->getDPRCalleeSavedAreaSize());
-
- // Move SP to start of integer callee save spill area 1.
- movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 2, STI);
- emitSPUpdate(MBB, MBBI, TII, dl, AFI->getGPRCalleeSavedArea2Size());
-
-    // Restore SP to its value upon entry to the function.
- movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 1, STI);
- emitSPUpdate(MBB, MBBI, TII, dl, AFI->getGPRCalleeSavedArea1Size());
- }
-
- if (VARegSaveSize)
- emitSPUpdate(MBB, MBBI, TII, dl, VARegSaveSize);
-
-}
-
-unsigned ARMBaseRegisterInfo::getRARegister() const {
- return ARM::LR;
-}
-
-unsigned ARMBaseRegisterInfo::getFrameRegister(MachineFunction &MF) const {
- if (STI.isTargetDarwin() || hasFP(MF))
- return FramePtr;
- return ARM::SP;
-}
-
-unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const {
- assert(0 && "What is the exception register");
- return 0;
-}
-
-unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
- assert(0 && "What is the exception handler register");
- return 0;
-}
-
-int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
- const MachineFunction &MF) const {
- switch (Reg) {
- default: break;
- // Return 0 if either register of the pair is a special register.
- // So no R12, etc.
- case ARM::R1:
- return ARM::R0;
- case ARM::R3:
- // FIXME!
- return STI.isThumb() ? 0 : ARM::R2;
- case ARM::R5:
- return ARM::R4;
- case ARM::R7:
- return isReservedReg(MF, ARM::R7) ? 0 : ARM::R6;
- case ARM::R9:
- return isReservedReg(MF, ARM::R9) ? 0 :ARM::R8;
- case ARM::R11:
- return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10;
-
- case ARM::S1:
- return ARM::S0;
- case ARM::S3:
- return ARM::S2;
- case ARM::S5:
- return ARM::S4;
- case ARM::S7:
- return ARM::S6;
- case ARM::S9:
- return ARM::S8;
- case ARM::S11:
- return ARM::S10;
- case ARM::S13:
- return ARM::S12;
- case ARM::S15:
- return ARM::S14;
- case ARM::S17:
- return ARM::S16;
- case ARM::S19:
- return ARM::S18;
- case ARM::S21:
- return ARM::S20;
- case ARM::S23:
- return ARM::S22;
- case ARM::S25:
- return ARM::S24;
- case ARM::S27:
- return ARM::S26;
- case ARM::S29:
- return ARM::S28;
- case ARM::S31:
- return ARM::S30;
-
- case ARM::D1:
- return ARM::D0;
- case ARM::D3:
- return ARM::D2;
- case ARM::D5:
- return ARM::D4;
- case ARM::D7:
- return ARM::D6;
- case ARM::D9:
- return ARM::D8;
- case ARM::D11:
- return ARM::D10;
- case ARM::D13:
- return ARM::D12;
- case ARM::D15:
- return ARM::D14;
- }
-
- return 0;
-}
-
-unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
- const MachineFunction &MF) const {
- switch (Reg) {
- default: break;
- // Return 0 if either register of the pair is a special register.
- // So no R12, etc.
- case ARM::R0:
- return ARM::R1;
- case ARM::R2:
- // FIXME!
- return STI.isThumb() ? 0 : ARM::R3;
- case ARM::R4:
- return ARM::R5;
- case ARM::R6:
- return isReservedReg(MF, ARM::R7) ? 0 : ARM::R7;
- case ARM::R8:
- return isReservedReg(MF, ARM::R9) ? 0 :ARM::R9;
- case ARM::R10:
- return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11;
-
- case ARM::S0:
- return ARM::S1;
- case ARM::S2:
- return ARM::S3;
- case ARM::S4:
- return ARM::S5;
- case ARM::S6:
- return ARM::S7;
- case ARM::S8:
- return ARM::S9;
- case ARM::S10:
- return ARM::S11;
- case ARM::S12:
- return ARM::S13;
- case ARM::S14:
- return ARM::S15;
- case ARM::S16:
- return ARM::S17;
- case ARM::S18:
- return ARM::S19;
- case ARM::S20:
- return ARM::S21;
- case ARM::S22:
- return ARM::S23;
- case ARM::S24:
- return ARM::S25;
- case ARM::S26:
- return ARM::S27;
- case ARM::S28:
- return ARM::S29;
- case ARM::S30:
- return ARM::S31;
-
- case ARM::D0:
- return ARM::D1;
- case ARM::D2:
- return ARM::D3;
- case ARM::D4:
- return ARM::D5;
- case ARM::D6:
- return ARM::D7;
- case ARM::D8:
- return ARM::D9;
- case ARM::D10:
- return ARM::D11;
- case ARM::D12:
- return ARM::D13;
- case ARM::D14:
- return ARM::D15;
- }
-
- return 0;
-}
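
For the contiguous S and D register files, the two switches above reduce to toggling the low bit of the register index; only the GPRs need the extra reserved-register checks. Purely as illustration (not an LLVM API):

    #include <cassert>

    // Partner of a register within its even / odd pair, by file index.
    static unsigned pairPartner(unsigned Index) { return Index ^ 1; }

    int main() {
      assert(pairPartner(0) == 1);   // S0  <-> S1
      assert(pairPartner(15) == 14); // S15 <-> S14
    }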
-
-#include "ARMGenRegisterInfo.inc"
diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
index 7fe075a..041afd0 100644
--- a/lib/Target/ARM/ARMRegisterInfo.h
+++ b/lib/Target/ARM/ARMRegisterInfo.h
@@ -16,127 +16,26 @@
#include "ARM.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "ARMGenRegisterInfo.h.inc"
+#include "ARMBaseRegisterInfo.h"
namespace llvm {
class ARMSubtarget;
- class TargetInstrInfo;
+ class ARMBaseInstrInfo;
class Type;
-/// Register allocation hints.
-namespace ARMRI {
- enum {
- RegPairOdd = 1,
- RegPairEven = 2
+namespace ARM {
+ /// SubregIndex - The index of various subregister classes. Note that
+ /// these indices must be kept in sync with the class indices in the
+ /// ARMRegisterInfo.td file.
+ enum SubregIndex {
+ SSUBREG_0 = 1, SSUBREG_1 = 2, SSUBREG_2 = 3, SSUBREG_3 = 4,
+ DSUBREG_0 = 5, DSUBREG_1 = 6
};
}
-/// isARMLowRegister - Returns true if the register is low register r0-r7.
-///
-static inline bool isARMLowRegister(unsigned Reg) {
- using namespace ARM;
- switch (Reg) {
- case R0: case R1: case R2: case R3:
- case R4: case R5: case R6: case R7:
- return true;
- default:
- return false;
- }
-}
-
-struct ARMBaseRegisterInfo : public ARMGenRegisterInfo {
-protected:
- const TargetInstrInfo &TII;
- const ARMSubtarget &STI;
-
- /// FramePtr - ARM physical register used as frame ptr.
- unsigned FramePtr;
-public:
- ARMBaseRegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
-
- /// getRegisterNumbering - Given the enum value for some register, e.g.
- /// ARM::LR, return the number that it corresponds to (e.g. 14).
- static unsigned getRegisterNumbering(unsigned RegEnum);
-
-  /// Same as the previous getRegisterNumbering, except it sets isSPVFP to
-  /// true if the register is a single-precision VFP register.
-
- /// Code Generation virtual methods...
- const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
-
- const TargetRegisterClass* const*
- getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
-
- BitVector getReservedRegs(const MachineFunction &MF) const;
-
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
-
- const TargetRegisterClass *getPointerRegClass() const;
-
- std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
- getAllocationOrder(const TargetRegisterClass *RC,
- unsigned HintType, unsigned HintReg,
- const MachineFunction &MF) const;
-
- unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg,
- const MachineFunction &MF) const;
-
- void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
- MachineFunction &MF) const;
-
- bool hasFP(const MachineFunction &MF) const;
-
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS = NULL) const;
-
- // Debug information queries.
- unsigned getRARegister() const;
- unsigned getFrameRegister(MachineFunction &MF) const;
-
- // Exception handling queries.
- unsigned getEHExceptionRegister() const;
- unsigned getEHHandlerRegister() const;
-
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-
- bool isLowRegister(unsigned Reg) const;
-
-private:
- unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const;
-
- unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const;
-};
-
struct ARMRegisterInfo : public ARMBaseRegisterInfo {
public:
- ARMRegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
-
-  /// emitLoadConstPool - Emits a load from the constant pool to materialize
-  /// the specified immediate.
- void emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo *TII, DebugLoc dl,
- unsigned DestReg, int Val,
- ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0) const;
-
- /// Code Generation virtual methods...
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
-
- bool requiresRegisterScavenging(const MachineFunction &MF) const;
-
- bool hasReservedCallFrame(MachineFunction &MF) const;
-
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
-
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
-
- void emitPrologue(MachineFunction &MF) const;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ ARMRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index a057e5c..20a7355 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -18,8 +18,8 @@ class ARMReg<bits<4> num, string n, list<Register> subregs = []> : Register<n> {
let SubRegs = subregs;
}
-class ARMFReg<bits<5> num, string n> : Register<n> {
- field bits<5> Num;
+class ARMFReg<bits<6> num, string n> : Register<n> {
+ field bits<6> Num;
let Namespace = "ARM";
}
@@ -58,10 +58,11 @@ def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">;
def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">;
def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">;
def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
+def SDummy : ARMFReg<63, "sINVALID">;
// Aliases of the F* registers used to hold 64-bit fp values (doubles)
def D0 : ARMReg< 0, "d0", [S0, S1]>;
-def D1 : ARMReg< 1, "d1", [S2, S3]>;
+def D1 : ARMReg< 1, "d1", [S2, S3]>;
def D2 : ARMReg< 2, "d2", [S4, S5]>;
def D3 : ARMReg< 3, "d3", [S6, S7]>;
def D4 : ARMReg< 4, "d4", [S8, S9]>;
@@ -78,18 +79,18 @@ def D14 : ARMReg<14, "d14", [S28, S29]>;
def D15 : ARMReg<15, "d15", [S30, S31]>;
// VFP3 defines 16 additional double registers
-def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d16">;
-def D18 : ARMFReg<18, "d16">; def D19 : ARMFReg<19, "d16">;
-def D20 : ARMFReg<20, "d16">; def D21 : ARMFReg<21, "d16">;
-def D22 : ARMFReg<22, "d16">; def D23 : ARMFReg<23, "d16">;
-def D24 : ARMFReg<24, "d16">; def D25 : ARMFReg<25, "d16">;
-def D26 : ARMFReg<26, "d16">; def D27 : ARMFReg<27, "d16">;
-def D28 : ARMFReg<28, "d16">; def D29 : ARMFReg<29, "d16">;
-def D30 : ARMFReg<30, "d16">; def D31 : ARMFReg<31, "d16">;
+def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">;
+def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">;
+def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">;
+def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">;
+def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">;
+def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">;
+def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">;
+def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">;
// Advanced SIMD (NEON) defines 16 quad-word aliases
def Q0 : ARMReg< 0, "q0", [D0, D1]>;
-def Q1 : ARMReg< 1, "q1", [D2, D3]>;
+def Q1 : ARMReg< 1, "q1", [D2, D3]>;
def Q2 : ARMReg< 2, "q2", [D4, D5]>;
def Q3 : ARMReg< 3, "q3", [D6, D7]>;
def Q4 : ARMReg< 4, "q4", [D8, D9]>;
@@ -106,7 +107,9 @@ def Q14 : ARMReg<14, "q14", [D28, D29]>;
def Q15 : ARMReg<15, "q15", [D30, D31]>;
// Current Program Status Register.
-def CPSR : ARMReg<0, "cpsr">;
+def CPSR : ARMReg<0, "cpsr">;
+
+def FPSCR : ARMReg<1, "fpscr">;
// Register classes.
//
@@ -158,6 +161,13 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
ARM::R4, ARM::R5, ARM::R6,
ARM::R8, ARM::R10,ARM::R11,
ARM::R7 };
+    // FP is R7, R9 is available as a callee-saved register.
+    // This is used by non-Darwin platforms in Thumb mode.
+ static const unsigned ARM_GPR_AO_5[] = {
+ ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+ ARM::R12,ARM::LR,
+ ARM::R4, ARM::R5, ARM::R6,
+ ARM::R8, ARM::R9, ARM::R10,ARM::R11,ARM::R7 };
GPRClass::iterator
GPRClass::allocation_order_begin(const MachineFunction &MF) const {
@@ -171,6 +181,8 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
} else {
if (Subtarget.isR9Reserved())
return ARM_GPR_AO_2;
+ else if (Subtarget.isThumb())
+ return ARM_GPR_AO_5;
else
return ARM_GPR_AO_1;
}
@@ -191,6 +203,8 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
} else {
if (Subtarget.isR9Reserved())
I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned));
+ else if (Subtarget.isThumb())
+ I = ARM_GPR_AO_5 + (sizeof(ARM_GPR_AO_5)/sizeof(unsigned));
else
I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned));
}
@@ -240,32 +254,45 @@ def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
+// Subset of SPR registers that can be used as a source of NEON scalars for
+// 16-bit operations.
+def SPR_8 : RegisterClass<"ARM", [f32], 32,
+ [S0, S1, S2, S3, S4, S5, S6, S7,
+ S8, S9, S10, S11, S12, S13, S14, S15]>;
+
+// Dummy f32 regclass to represent impossible subreg indices.
+def SPR_INVALID : RegisterClass<"ARM", [f32], 32, [SDummy]> {
+ let CopyCost = -1;
+}
+
// Scalar double precision floating point / generic 64-bit vector register
// class.
// ARM requires only word alignment for double. It performs better with
// double-word alignment, though.
def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
[D0, D1, D2, D3, D4, D5, D6, D7,
- D8, D9, D10, D11, D12, D13, D14, D15]> {
- let SubRegClassList = [SPR, SPR];
+ D8, D9, D10, D11, D12, D13, D14, D15,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31]> {
+ let SubRegClassList = [SPR_INVALID, SPR_INVALID];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// VFP2
- static const unsigned ARM_DPR_VFP2[] = {
- ARM::D0, ARM::D1, ARM::D2, ARM::D3,
- ARM::D4, ARM::D5, ARM::D6, ARM::D7,
- ARM::D8, ARM::D9, ARM::D10, ARM::D11,
+ static const unsigned ARM_DPR_VFP2[] = {
+ ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+ ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
// VFP3
static const unsigned ARM_DPR_VFP3[] = {
- ARM::D0, ARM::D1, ARM::D2, ARM::D3,
- ARM::D4, ARM::D5, ARM::D6, ARM::D7,
- ARM::D8, ARM::D9, ARM::D10, ARM::D11,
+ ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+ ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15,
- ARM::D16, ARM::D17, ARM::D18, ARM::D15,
+ ARM::D16, ARM::D17, ARM::D18, ARM::D19,
ARM::D20, ARM::D21, ARM::D22, ARM::D23,
ARM::D24, ARM::D25, ARM::D26, ARM::D27,
ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
@@ -290,11 +317,34 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
}];
}
+// Subset of DPR registers that are accessible with VFP2 (and that therefore
+// also have 32-bit SPR subregs).
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v2i32, v2f32], 64,
+ [D0, D1, D2, D3, D4, D5, D6, D7,
+ D8, D9, D10, D11, D12, D13, D14, D15]> {
+ let SubRegClassList = [SPR, SPR];
+}
+
+// Subset of DPR registers that can be used as a source of NEON scalars for
+// 16-bit operations.
+def DPR_8 : RegisterClass<"ARM", [f64, v4i16, v2f32], 64,
+ [D0, D1, D2, D3, D4, D5, D6, D7]> {
+ let SubRegClassList = [SPR_8, SPR_8];
+}
+
// Generic 128-bit vector register class.
def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
- let SubRegClassList = [SPR, SPR, SPR, SPR, DPR, DPR];
+ let SubRegClassList = [SPR_INVALID, SPR_INVALID, SPR_INVALID, SPR_INVALID,
+ DPR, DPR];
+}
+
+// Subset of QPR that have 32-bit SPR subregs.
+def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128,
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]> {
+ let SubRegClassList = [SPR, SPR, SPR, SPR, DPR_VFP2, DPR_VFP2];
}
// Condition code registers.
@@ -341,4 +391,3 @@ def : SubRegSet<6, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
[D1, D3, D5, D7, D9, D11, D13, D15,
D17, D19, D21, D23, D25, D27, D29, D31]>;
-
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 75fa707..fc4c5f5 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -10,26 +10,151 @@
//===----------------------------------------------------------------------===//
// Functional units across ARM processors
//
-def FU_iALU : FuncUnit; // Integer alu unit
-def FU_iLdSt : FuncUnit; // Integer load / store unit
-def FU_FpALU : FuncUnit; // FP alu unit
-def FU_FpLdSt : FuncUnit; // FP load / store unit
-def FU_Br : FuncUnit; // Branch unit
+def FU_Issue : FuncUnit; // issue
+def FU_Pipe0 : FuncUnit; // pipeline 0
+def FU_Pipe1 : FuncUnit; // pipeline 1
+def FU_LdSt0 : FuncUnit; // pipeline 0 load/store
+def FU_LdSt1 : FuncUnit; // pipeline 1 load/store
+def FU_NPipe : FuncUnit; // NEON ALU/MUL pipe
+def FU_NLSPipe : FuncUnit; // NEON LS pipe
//===----------------------------------------------------------------------===//
// Instruction Itinerary classes used for ARM
//
-def IIC_iALU : InstrItinClass;
-def IIC_iLoad : InstrItinClass;
-def IIC_iStore : InstrItinClass;
-def IIC_fpALU : InstrItinClass;
-def IIC_fpLoad : InstrItinClass;
-def IIC_fpStore : InstrItinClass;
-def IIC_Br : InstrItinClass;
+def IIC_iALUx : InstrItinClass;
+def IIC_iALUi : InstrItinClass;
+def IIC_iALUr : InstrItinClass;
+def IIC_iALUsi : InstrItinClass;
+def IIC_iALUsr : InstrItinClass;
+def IIC_iUNAr : InstrItinClass;
+def IIC_iUNAsi : InstrItinClass;
+def IIC_iUNAsr : InstrItinClass;
+def IIC_iCMPi : InstrItinClass;
+def IIC_iCMPr : InstrItinClass;
+def IIC_iCMPsi : InstrItinClass;
+def IIC_iCMPsr : InstrItinClass;
+def IIC_iMOVi : InstrItinClass;
+def IIC_iMOVr : InstrItinClass;
+def IIC_iMOVsi : InstrItinClass;
+def IIC_iMOVsr : InstrItinClass;
+def IIC_iCMOVi : InstrItinClass;
+def IIC_iCMOVr : InstrItinClass;
+def IIC_iCMOVsi : InstrItinClass;
+def IIC_iCMOVsr : InstrItinClass;
+def IIC_iMUL16 : InstrItinClass;
+def IIC_iMAC16 : InstrItinClass;
+def IIC_iMUL32 : InstrItinClass;
+def IIC_iMAC32 : InstrItinClass;
+def IIC_iMUL64 : InstrItinClass;
+def IIC_iMAC64 : InstrItinClass;
+def IIC_iLoadi : InstrItinClass;
+def IIC_iLoadr : InstrItinClass;
+def IIC_iLoadsi : InstrItinClass;
+def IIC_iLoadiu : InstrItinClass;
+def IIC_iLoadru : InstrItinClass;
+def IIC_iLoadsiu : InstrItinClass;
+def IIC_iLoadm : InstrItinClass;
+def IIC_iStorei : InstrItinClass;
+def IIC_iStorer : InstrItinClass;
+def IIC_iStoresi : InstrItinClass;
+def IIC_iStoreiu : InstrItinClass;
+def IIC_iStoreru : InstrItinClass;
+def IIC_iStoresiu : InstrItinClass;
+def IIC_iStorem : InstrItinClass;
+def IIC_Br : InstrItinClass;
+def IIC_fpSTAT : InstrItinClass;
+def IIC_fpUNA32 : InstrItinClass;
+def IIC_fpUNA64 : InstrItinClass;
+def IIC_fpCMP32 : InstrItinClass;
+def IIC_fpCMP64 : InstrItinClass;
+def IIC_fpCVTSD : InstrItinClass;
+def IIC_fpCVTDS : InstrItinClass;
+def IIC_fpCVTIS : InstrItinClass;
+def IIC_fpCVTID : InstrItinClass;
+def IIC_fpCVTSI : InstrItinClass;
+def IIC_fpCVTDI : InstrItinClass;
+def IIC_fpALU32 : InstrItinClass;
+def IIC_fpALU64 : InstrItinClass;
+def IIC_fpMUL32 : InstrItinClass;
+def IIC_fpMUL64 : InstrItinClass;
+def IIC_fpMAC32 : InstrItinClass;
+def IIC_fpMAC64 : InstrItinClass;
+def IIC_fpDIV32 : InstrItinClass;
+def IIC_fpDIV64 : InstrItinClass;
+def IIC_fpSQRT32 : InstrItinClass;
+def IIC_fpSQRT64 : InstrItinClass;
+def IIC_fpLoad32 : InstrItinClass;
+def IIC_fpLoad64 : InstrItinClass;
+def IIC_fpLoadm : InstrItinClass;
+def IIC_fpStore32 : InstrItinClass;
+def IIC_fpStore64 : InstrItinClass;
+def IIC_fpStorem : InstrItinClass;
+def IIC_VLD1 : InstrItinClass;
+def IIC_VLD2 : InstrItinClass;
+def IIC_VLD3 : InstrItinClass;
+def IIC_VLD4 : InstrItinClass;
+def IIC_VST : InstrItinClass;
+def IIC_VUNAD : InstrItinClass;
+def IIC_VUNAQ : InstrItinClass;
+def IIC_VBIND : InstrItinClass;
+def IIC_VBINQ : InstrItinClass;
+def IIC_VMOVImm : InstrItinClass;
+def IIC_VMOVD : InstrItinClass;
+def IIC_VMOVQ : InstrItinClass;
+def IIC_VMOVIS : InstrItinClass;
+def IIC_VMOVID : InstrItinClass;
+def IIC_VMOVISL : InstrItinClass;
+def IIC_VMOVSI : InstrItinClass;
+def IIC_VMOVDI : InstrItinClass;
+def IIC_VPERMD : InstrItinClass;
+def IIC_VPERMQ : InstrItinClass;
+def IIC_VPERMQ3 : InstrItinClass;
+def IIC_VMACD : InstrItinClass;
+def IIC_VMACQ : InstrItinClass;
+def IIC_VRECSD : InstrItinClass;
+def IIC_VRECSQ : InstrItinClass;
+def IIC_VCNTiD : InstrItinClass;
+def IIC_VCNTiQ : InstrItinClass;
+def IIC_VUNAiD : InstrItinClass;
+def IIC_VUNAiQ : InstrItinClass;
+def IIC_VQUNAiD : InstrItinClass;
+def IIC_VQUNAiQ : InstrItinClass;
+def IIC_VBINiD : InstrItinClass;
+def IIC_VBINiQ : InstrItinClass;
+def IIC_VSUBiD : InstrItinClass;
+def IIC_VSUBiQ : InstrItinClass;
+def IIC_VBINi4D : InstrItinClass;
+def IIC_VBINi4Q : InstrItinClass;
+def IIC_VSHLiD : InstrItinClass;
+def IIC_VSHLiQ : InstrItinClass;
+def IIC_VSHLi4D : InstrItinClass;
+def IIC_VSHLi4Q : InstrItinClass;
+def IIC_VPALiD : InstrItinClass;
+def IIC_VPALiQ : InstrItinClass;
+def IIC_VMULi16D : InstrItinClass;
+def IIC_VMULi32D : InstrItinClass;
+def IIC_VMULi16Q : InstrItinClass;
+def IIC_VMULi32Q : InstrItinClass;
+def IIC_VMACi16D : InstrItinClass;
+def IIC_VMACi32D : InstrItinClass;
+def IIC_VMACi16Q : InstrItinClass;
+def IIC_VMACi32Q : InstrItinClass;
+def IIC_VEXTD : InstrItinClass;
+def IIC_VEXTQ : InstrItinClass;
+def IIC_VTB1 : InstrItinClass;
+def IIC_VTB2 : InstrItinClass;
+def IIC_VTB3 : InstrItinClass;
+def IIC_VTB4 : InstrItinClass;
+def IIC_VTBX1 : InstrItinClass;
+def IIC_VTBX2 : InstrItinClass;
+def IIC_VTBX3 : InstrItinClass;
+def IIC_VTBX4 : InstrItinClass;
//===----------------------------------------------------------------------===//
// Processor instruction itineraries.
def GenericItineraries : ProcessorItineraries<[]>;
+
include "ARMScheduleV6.td"
+include "ARMScheduleV7.td"
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td
index 596a57f..1ace718 100644
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -1,4 +1,4 @@
-//===- ARMSchedule.td - ARM v6 Scheduling Definitions ------*- tablegen -*-===//
+//===- ARMScheduleV6.td - ARM v6 Scheduling Definitions ----*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,12 +11,4 @@
//
//===----------------------------------------------------------------------===//
-def V6Itineraries : ProcessorItineraries<[
- InstrItinData<IIC_iALU , [InstrStage<1, [FU_iALU]>]>,
- InstrItinData<IIC_iLoad , [InstrStage<2, [FU_iLdSt]>]>,
- InstrItinData<IIC_iStore , [InstrStage<1, [FU_iLdSt]>]>,
- InstrItinData<IIC_fpALU , [InstrStage<6, [FU_FpALU]>]>,
- InstrItinData<IIC_fpLoad , [InstrStage<2, [FU_FpLdSt]>]>,
- InstrItinData<IIC_fpStore , [InstrStage<1, [FU_FpLdSt]>]>,
- InstrItinData<IIC_Br , [InstrStage<3, [FU_Br]>]>
-]>;
+// TODO: Add model for an ARM11
diff --git a/lib/Target/ARM/ARMScheduleV7.td b/lib/Target/ARM/ARMScheduleV7.td
new file mode 100644
index 0000000..e565813
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleV7.td
@@ -0,0 +1,587 @@
+//===- ARMScheduleV7.td - ARM v7 Scheduling Definitions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM v7 processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from "Cortex-A8 Technical Reference Manual".
+//
+// Dual issue pipeline represented by FU_Pipe0 | FU_Pipe1
+//
+def CortexA8Itineraries : ProcessorItineraries<[
+
+ // Two fully-pipelined integer ALU pipelines
+ //
+ // No operand cycles
+ InstrItinData<IIC_iALUx , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
+ //
+ // Binary Instructions that produce a result
+ InstrItinData<IIC_iALUi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iALUr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 2]>,
+ InstrItinData<IIC_iALUsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 1]>,
+ InstrItinData<IIC_iALUsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 1, 1]>,
+ //
+ // Unary Instructions that produce a result
+ InstrItinData<IIC_iUNAr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iUNAsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iUNAsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>,
+ //
+ // Compare instructions
+ InstrItinData<IIC_iCMPi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2]>,
+ InstrItinData<IIC_iCMPr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iCMPsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iCMPsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>,
+ //
+ // Move instructions, unconditional
+ InstrItinData<IIC_iMOVi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1]>,
+ InstrItinData<IIC_iMOVr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1]>,
+ InstrItinData<IIC_iMOVsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1]>,
+ InstrItinData<IIC_iMOVsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1, 1]>,
+ //
+ // Move instructions, conditional
+ InstrItinData<IIC_iCMOVi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2]>,
+ InstrItinData<IIC_iCMOVr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iCMOVsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iCMOVsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>,
+
+ // Integer multiply pipeline
+  // Result written in E5, but that is relative to the last cycle of a
+  // multicycle op, so we use 6 for those cases.
+ //
+ InstrItinData<IIC_iMUL16 , [InstrStage<1, [FU_Pipe0]>], [5, 1, 1]>,
+ InstrItinData<IIC_iMAC16 , [InstrStage<1, [FU_Pipe1], 0>,
+ InstrStage<2, [FU_Pipe0]>], [6, 1, 1, 4]>,
+ InstrItinData<IIC_iMUL32 , [InstrStage<1, [FU_Pipe1], 0>,
+ InstrStage<2, [FU_Pipe0]>], [6, 1, 1]>,
+ InstrItinData<IIC_iMAC32 , [InstrStage<1, [FU_Pipe1], 0>,
+ InstrStage<2, [FU_Pipe0]>], [6, 1, 1, 4]>,
+ InstrItinData<IIC_iMUL64 , [InstrStage<2, [FU_Pipe1], 0>,
+ InstrStage<3, [FU_Pipe0]>], [6, 6, 1, 1]>,
+ InstrItinData<IIC_iMAC64 , [InstrStage<2, [FU_Pipe1], 0>,
+ InstrStage<3, [FU_Pipe0]>], [6, 6, 1, 1]>,
+
+ // Integer load pipeline
+ //
+  // Loads have an extra cycle of latency, but are fully pipelined.
+  // Use FU_Issue to enforce the limit of one load / store per cycle.
+ //
+ // Immediate offset
+ InstrItinData<IIC_iLoadi , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iLoadr , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>,
+ //
+ // Scaled register offset, issues over 2 cycles
+ InstrItinData<IIC_iLoadsi , [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [4, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iLoadiu , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 2, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iLoadru , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 2, 1, 1]>,
+ //
+ // Scaled register offset with update, issues over 2 cycles
+ InstrItinData<IIC_iLoadsiu , [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [4, 3, 1, 1]>,
+ //
+ // Load multiple
+ InstrItinData<IIC_iLoadm , [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<2, [FU_Pipe0], 0>,
+ InstrStage<2, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>]>,
+
+ // Integer store pipeline
+ //
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ //
+ // Immediate offset
+ InstrItinData<IIC_iStorei , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iStorer , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>,
+ //
+ // Scaled register offset, issues over 2 cycles
+ InstrItinData<IIC_iStoresi , [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iStoreiu , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [2, 3, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iStoreru , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [2, 3, 1, 1]>,
+ //
+ // Scaled register offset with update, issues over 2 cycles
+ InstrItinData<IIC_iStoresiu, [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 3, 1, 1]>,
+ //
+ // Store multiple
+ InstrItinData<IIC_iStorem , [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<2, [FU_Pipe0], 0>,
+ InstrStage<2, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>]>,
+
+ // Branch
+ //
+ // no delay slots, so the latency of a branch is unimportant
+ InstrItinData<IIC_Br , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
+
+ // VFP
+ // Issue through integer pipeline, and execute in NEON unit. We assume
+ // RunFast mode so that NFP pipeline is used for single-precision when
+ // possible.
+ //
+ // FP Special Register to Integer Register File Move
+ InstrItinData<IIC_fpSTAT , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP Unary
+ InstrItinData<IIC_fpUNA32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP Unary
+ InstrItinData<IIC_fpUNA64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<4, [FU_NPipe], 0>,
+ InstrStage<4, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP Compare
+ InstrItinData<IIC_fpCMP32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP Compare
+ InstrItinData<IIC_fpCMP64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<4, [FU_NPipe], 0>,
+ InstrStage<4, [FU_NLSPipe]>]>,
+ //
+ // Single to Double FP Convert
+ InstrItinData<IIC_fpCVTSD , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<7, [FU_NPipe], 0>,
+ InstrStage<7, [FU_NLSPipe]>]>,
+ //
+ // Double to Single FP Convert
+ InstrItinData<IIC_fpCVTDS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<5, [FU_NPipe], 0>,
+ InstrStage<5, [FU_NLSPipe]>]>,
+ //
+ // Single-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTSI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTDI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<8, [FU_NPipe], 0>,
+ InstrStage<8, [FU_NLSPipe]>]>,
+ //
+ // Integer to Single-Precision FP Convert
+ InstrItinData<IIC_fpCVTIS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Integer to Double-Precision FP Convert
+ InstrItinData<IIC_fpCVTID , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<8, [FU_NPipe], 0>,
+ InstrStage<8, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP ALU
+ InstrItinData<IIC_fpALU32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP ALU
+ InstrItinData<IIC_fpALU64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<9, [FU_NPipe], 0>,
+ InstrStage<9, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP Multiply
+ InstrItinData<IIC_fpMUL32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP Multiply
+ InstrItinData<IIC_fpMUL64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<11, [FU_NPipe], 0>,
+ InstrStage<11, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP MAC
+ InstrItinData<IIC_fpMAC32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP MAC
+ InstrItinData<IIC_fpMAC64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<19, [FU_NPipe], 0>,
+ InstrStage<19, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP DIV
+ InstrItinData<IIC_fpDIV32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<20, [FU_NPipe], 0>,
+ InstrStage<20, [FU_NLSPipe]>]>,
+ //
+ // Double-precision FP DIV
+ InstrItinData<IIC_fpDIV64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<29, [FU_NPipe], 0>,
+ InstrStage<29, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP SQRT
+ InstrItinData<IIC_fpSQRT32, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<19, [FU_NPipe], 0>,
+ InstrStage<19, [FU_NLSPipe]>]>,
+ //
+ // Double-precision FP SQRT
+ InstrItinData<IIC_fpSQRT64, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<29, [FU_NPipe], 0>,
+ InstrStage<29, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP Load
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpLoad32, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // Double-precision FP Load
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpLoad64, [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // FP Load Multiple
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpLoadm, [InstrStage<3, [FU_Issue], 0>,
+ InstrStage<2, [FU_Pipe0], 0>,
+ InstrStage<2, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP Store
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpStore32,[InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // Double-precision FP Store
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpStore64,[InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // FP Store Multiple
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpStorem, [InstrStage<3, [FU_Issue], 0>,
+ InstrStage<2, [FU_Pipe0], 0>,
+ InstrStage<2, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+
+ // NEON
+ // Issue through integer pipeline, and execute in NEON unit.
+ //
+ // VLD1
+ InstrItinData<IIC_VLD1, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // VLD2
+ InstrItinData<IIC_VLD2, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 2, 1]>,
+ //
+ // VLD3
+ InstrItinData<IIC_VLD3, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 2, 2, 1]>,
+ //
+ // VLD4
+ InstrItinData<IIC_VLD4, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 2, 2, 2, 1]>,
+ //
+ // VST
+ InstrItinData<IIC_VST, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // Double-register FP Unary
+ InstrItinData<IIC_VUNAD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [5, 2]>,
+ //
+ // Quad-register FP Unary
+  // Result written in N5, but that is relative to the last cycle of the
+  // multi-cycle itinerary, so we use 6 for those cases
+ InstrItinData<IIC_VUNAQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [6, 2]>,
+ //
+ // Double-register FP Binary
+ InstrItinData<IIC_VBIND, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [5, 2, 2]>,
+ //
+ // Quad-register FP Binary
+  // Result written in N5, but that is relative to the last cycle of the
+  // multi-cycle itinerary, so we use 6 for those cases
+ InstrItinData<IIC_VBINQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [6, 2, 2]>,
+ //
+ // Move Immediate
+ InstrItinData<IIC_VMOVImm, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3]>,
+ //
+ // Double-register Permute Move
+ InstrItinData<IIC_VMOVD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 1]>,
+ //
+ // Quad-register Permute Move
+  // Result written in N2, but that is relative to the last cycle of the
+  // multi-cycle itinerary, so we use 3 for those cases
+ InstrItinData<IIC_VMOVQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 1]>,
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_VMOVIS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_VMOVID , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 1, 1]>,
+ //
+ // Single-precision to Integer Move
+ InstrItinData<IIC_VMOVSI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [20, 1]>,
+ //
+ // Double-precision to Integer Move
+ InstrItinData<IIC_VMOVDI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [20, 20, 1]>,
+ //
+ // Integer to Lane Move
+ InstrItinData<IIC_VMOVISL , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 1, 1]>,
+ //
+ // Double-register Permute
+ InstrItinData<IIC_VPERMD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 2, 1, 1]>,
+ //
+ // Quad-register Permute
+  // Result written in N2, but that is relative to the last cycle of the
+  // multi-cycle itinerary, so we use 3 for those cases
+ InstrItinData<IIC_VPERMQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 3, 1, 1]>,
+ //
+ // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of the
+  // multi-cycle itinerary, so we use 4 for those cases
+ InstrItinData<IIC_VPERMQ3, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>,
+ InstrStage<1, [FU_NPipe], 0>,
+ InstrStage<2, [FU_NLSPipe]>], [4, 4, 1, 1]>,
+ //
+  // Double-register FP Multiply-Accumulate
+ InstrItinData<IIC_VMACD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [9, 2, 2, 3]>,
+ //
+  // Quad-register FP Multiply-Accumulate
+  // Result written in N9, but that is relative to the last cycle of the
+  // multi-cycle itinerary, so we use 10 for those cases
+ InstrItinData<IIC_VMACQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [10, 2, 2, 3]>,
+ //
+  // Double-register Reciprocal Step
+ InstrItinData<IIC_VRECSD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [9, 2, 2]>,
+ //
+  // Quad-register Reciprocal Step
+ InstrItinData<IIC_VRECSQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [10, 2, 2]>,
+ //
+ // Double-register Integer Count
+ InstrItinData<IIC_VCNTiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 2, 2]>,
+ //
+ // Quad-register Integer Count
+  // Result written in N3, but that is relative to the last cycle of the
+  // multi-cycle itinerary, so we use 4 for those cases
+ InstrItinData<IIC_VCNTiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [4, 2, 2]>,
+ //
+ // Double-register Integer Unary
+ InstrItinData<IIC_VUNAiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 2]>,
+ //
+ // Quad-register Integer Unary
+ InstrItinData<IIC_VUNAiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 2]>,
+ //
+ // Double-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 1]>,
+ //
+  // Quad-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 1]>,
+ //
+ // Double-register Integer Binary
+ InstrItinData<IIC_VBINiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 2, 2]>,
+ //
+ // Quad-register Integer Binary
+ InstrItinData<IIC_VBINiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 2, 2]>,
+ //
+ // Double-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 2, 1]>,
+ //
+ // Quad-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 2, 1]>,
+ //
+ // Double-register Integer Subtract
+ InstrItinData<IIC_VSUBiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 2, 1]>,
+ //
+ // Quad-register Integer Subtract
+ InstrItinData<IIC_VSUBiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 2, 1]>,
+ //
+ // Double-register Integer Shift
+ InstrItinData<IIC_VSHLiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 1, 1]>,
+ //
+ // Quad-register Integer Shift
+ InstrItinData<IIC_VSHLiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [4, 1, 1]>,
+ //
+ // Double-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 1, 1]>,
+ //
+ // Quad-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [5, 1, 1]>,
+ //
+ // Double-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [6, 3, 2, 1]>,
+ //
+ // Quad-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [7, 3, 2, 1]>,
+ //
+ // Double-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [6, 2, 2]>,
+ //
+ // Double-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [7, 2, 1]>,
+ //
+ // Quad-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [7, 2, 2]>,
+ //
+ // Quad-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>,
+ InstrStage<2, [FU_NLSPipe], 0>,
+ InstrStage<3, [FU_NPipe]>], [9, 2, 1]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [6, 2, 2, 3]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [7, 2, 1, 3]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [7, 2, 2, 3]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>,
+ InstrStage<2, [FU_NLSPipe], 0>,
+ InstrStage<3, [FU_NPipe]>], [9, 2, 1, 3]>,
+ //
+ // Double-register VEXT
+ InstrItinData<IIC_VEXTD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 1, 1]>,
+ //
+ // Quad-register VEXT
+ InstrItinData<IIC_VEXTQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 1, 1]>,
+ //
+ // VTB
+ InstrItinData<IIC_VTB1, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 2, 1]>,
+ InstrItinData<IIC_VTB2, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 2, 2, 1]>,
+ InstrItinData<IIC_VTB3, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>,
+ InstrStage<1, [FU_NPipe], 0>,
+ InstrStage<2, [FU_NLSPipe]>], [4, 2, 2, 3, 1]>,
+ InstrItinData<IIC_VTB4, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>,
+ InstrStage<1, [FU_NPipe], 0>,
+ InstrStage<2, [FU_NLSPipe]>], [4, 2, 2, 3, 3, 1]>,
+ //
+ // VTBX
+ InstrItinData<IIC_VTBX1, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 1, 2, 1]>,
+ InstrItinData<IIC_VTBX2, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 1, 2, 2, 1]>,
+ InstrItinData<IIC_VTBX3, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>,
+ InstrStage<1, [FU_NPipe], 0>,
+ InstrStage<2, [FU_NLSPipe]>], [4, 1, 2, 2, 3, 1]>,
+ InstrItinData<IIC_VTBX4, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>,
+ InstrStage<1, [FU_NPipe], 0>,
+ InstrStage<2, [FU_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
+]>;
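
A note on the encoding used throughout this file: each InstrItinData pairs an itinerary class with a list of InstrStage<Cycles, [Units], Next> stages (the optional third operand is the cycle offset at which the next stage may begin, so a 0 reserves the listed units in parallel with the following stage) and an optional list of operand cycles, result latencies first and then source-read cycles. The stand-alone C++ sketch below models that encoding with illustrative types of its own invention, not LLVM's generated structures, using the IIC_iLoadsi entry as the worked example:

#include <cstdio>
#include <vector>

// Illustrative model only; the real InstrStage/InstrItinerary types differ.
struct Stage {
  unsigned Cycles;   // cycles for which the units are reserved
  const char *Units; // functional units usable by this stage
  int Next;          // 0 = next stage starts the same cycle; -1 = afterwards
};

struct Itin {
  std::vector<Stage> Stages;
  std::vector<unsigned> OperandCycles; // defs first, then uses
};

int main() {
  // IIC_iLoadsi from above: a scaled-register-offset load occupies the
  // issue unit for two cycles, and its result is ready in cycle 4 while
  // both address registers are read in cycle 1.
  Itin LoadSi = {
      {{2, "FU_Issue", 0}, {1, "FU_Pipe0", 0}, {1, "FU_Pipe1", -1},
       {1, "FU_Pipe0|FU_Pipe1", -1}, {1, "FU_LdSt0", -1}},
      {4, 1, 1}};
  std::printf("result latency: %u cycles\n", LoadSi.OperandCycles[0]);
  return 0;
}
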
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index e611088..cf1ee3f 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -13,8 +13,7 @@
#include "ARMSubtarget.h"
#include "ARMGenSubtarget.inc"
-#include "llvm/Module.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/GlobalValue.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -22,13 +21,19 @@ using namespace llvm;
static cl::opt<bool>
ReserveR9("arm-reserve-r9", cl::Hidden,
cl::desc("Reserve R9, making it unavailable as GPR"));
+static cl::opt<bool>
+UseNEONFP("arm-use-neon-fp",
+ cl::desc("Use NEON for single-precision FP"),
+ cl::init(false), cl::Hidden);
-ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
+ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
bool isThumb)
: ARMArchVersion(V4T)
, ARMFPUType(None)
+ , UseNEONForSinglePrecisionFP(UseNEONFP)
, IsThumb(isThumb)
, ThumbMode(Thumb1)
+ , PostRAScheduler(false)
, IsR9Reserved(ReserveR9)
, stackAlignment(4)
, CPUString("generic")
@@ -45,7 +50,6 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
// Set the boolean corresponding to the current target triple, or the default
// if one cannot be determined, to true.
- const std::string& TT = M.getTargetTriple();
unsigned Len = TT.length();
unsigned Idx = 0;
@@ -75,14 +79,14 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
}
}
+ // Thumb2 implies at least V6T2.
+ if (ARMArchVersion < V6T2 && ThumbMode >= Thumb2)
+ ARMArchVersion = V6T2;
+
if (Len >= 10) {
if (TT.find("-darwin") != std::string::npos)
// arm-darwin
TargetType = isDarwin;
- } else if (TT.empty()) {
-#if defined(__APPLE__)
- TargetType = isDarwin;
-#endif
}
if (TT.find("eabi") != std::string::npos)
@@ -93,4 +97,61 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
if (isTargetDarwin())
IsR9Reserved = ReserveR9 | (ARMArchVersion < V6);
+
+ // Set CPU specific features.
+ if (CPUString == "cortex-a8") {
+ PostRAScheduler = true;
+ if (UseNEONFP.getPosition() == 0)
+ UseNEONForSinglePrecisionFP = true;
+ }
+}
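
The UseNEONFP.getPosition() == 0 test above is the usual cl::opt idiom for asking whether a flag was actually written on the command line (position 0 means it never appeared), so a CPU default can be applied without overriding an explicit user choice. A minimal sketch of the same idiom, with a hypothetical flag name:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hypothetical flag, named here purely for illustration.
static cl::opt<bool>
UseFancyFP("use-fancy-fp", cl::desc("Hypothetical example flag"),
           cl::init(false), cl::Hidden);

static void applyCPUDefault(bool IsCortexA8, bool &UseFancy) {
  // Apply the CPU default only when the user expressed no preference;
  // an explicit flag on the command line always wins.
  if (IsCortexA8 && UseFancyFP.getPosition() == 0)
    UseFancy = true;
}
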
+
+/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
+bool
+ARMSubtarget::GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) const {
+ if (RelocM == Reloc::Static)
+ return false;
+
+  // A GV with ghost linkage (in JIT lazy compilation mode) does not require
+  // an extra load from a stub.
+ bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode();
+
+ if (!isTargetDarwin()) {
+    // An extra load is needed for all externally visible symbols.
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+ return false;
+ return true;
+ } else {
+ if (RelocM == Reloc::PIC_) {
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return false;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return true;
+
+ // If symbol visibility is hidden, we have a stub for common symbol
+ // references and external declarations.
+ if (isDecl || GV->hasCommonLinkage())
+ // Hidden $non_lazy_ptr reference.
+ return true;
+
+ return false;
+ } else {
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return false;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return true;
+ }
+ }
+
+ return false;
}
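
Since the two Darwin branches above differ only in the hidden-visibility case, the whole decision collapses to a small truth table. The following stand-alone restatement (plain bools, not LLVM API, written only to make the cases easy to eyeball) mirrors that logic:

#include <cassert>

static bool darwinNeedsStub(bool isPIC, bool isDecl, bool isWeak,
                            bool isHidden, bool isCommon) {
  if (!isDecl && !isWeak)
    return false;                     // strong reference to a definition
  if (!isHidden)
    return true;                      // non-hidden $non_lazy_ptr reference
  if (isPIC && (isDecl || isCommon))
    return true;                      // hidden $non_lazy_ptr reference
  return false;
}

int main() {
  assert(!darwinNeedsStub(true,  false, false, false, false)); // strong def
  assert( darwinNeedsStub(false, true,  false, false, false)); // extern decl
  assert( darwinNeedsStub(true,  true,  false, true,  false)); // hidden decl, PIC
  assert(!darwinNeedsStub(false, true,  false, true,  false)); // hidden decl, non-PIC
  return 0;
}
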
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 5110b31..7098fd4 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -15,11 +15,12 @@
#define ARMSUBTARGET_H
#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtarget.h"
#include <string>
namespace llvm {
-class Module;
+class GlobalValue;
class ARMSubtarget : public TargetSubtarget {
protected:
@@ -43,12 +44,20 @@ protected:
/// ARMFPUType - Floating Point Unit type.
ARMFPEnum ARMFPUType;
+  /// UseNEONForSinglePrecisionFP - True if the -arm-use-neon-fp option (or
+  /// a CPU default) requests NEON for single-precision FP. Use the method
+  /// useNEONForSinglePrecisionFP() to determine if NEON should actually be used.
+ bool UseNEONForSinglePrecisionFP;
+
/// IsThumb - True if we are in thumb mode, false if in ARM mode.
bool IsThumb;
/// ThumbMode - Indicates supported Thumb version.
ThumbTypeEnum ThumbMode;
+ /// PostRAScheduler - True if using post-register-allocation scheduler.
+ bool PostRAScheduler;
+
  /// IsR9Reserved - True if R9 is not available as a general purpose register.
bool IsR9Reserved;
@@ -61,7 +70,7 @@ protected:
/// Selected instruction itineraries (one entry per itinerary class.)
InstrItineraryData InstrItins;
-
+
public:
enum {
isELF, isDarwin
@@ -73,9 +82,9 @@ protected:
} TargetABI;
/// This constructor initializes the data members to match that
- /// of the specified module.
+ /// of the specified triple.
///
- ARMSubtarget(const Module &M, const std::string &FS, bool isThumb);
+ ARMSubtarget(const std::string &TT, const std::string &FS, bool isThumb);
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -99,6 +108,8 @@ protected:
bool hasVFP2() const { return ARMFPUType >= VFPv2; }
bool hasVFP3() const { return ARMFPUType >= VFPv3; }
bool hasNEON() const { return ARMFPUType >= NEON; }
+ bool useNEONForSinglePrecisionFP() const {
+ return hasNEON() && UseNEONForSinglePrecisionFP; }
bool isTargetDarwin() const { return TargetType == isDarwin; }
bool isTargetELF() const { return TargetType == isELF; }
@@ -108,14 +119,18 @@ protected:
bool isThumb() const { return IsThumb; }
bool isThumb1Only() const { return IsThumb && (ThumbMode == Thumb1); }
- bool isThumb2() const { return IsThumb && (ThumbMode >= Thumb2); }
+ bool isThumb2() const { return IsThumb && (ThumbMode == Thumb2); }
bool hasThumb2() const { return ThumbMode >= Thumb2; }
bool isR9Reserved() const { return IsR9Reserved; }
const std::string & getCPUString() const { return CPUString; }
+
+ /// enablePostRAScheduler - From TargetSubtarget, return true to
+ /// enable post-RA scheduler.
+ bool enablePostRAScheduler() const { return PostRAScheduler; }
- /// getInstrItins - Return the instruction itineraies based on subtarget
+  /// getInstrItins - Return the instruction itineraries based on subtarget
/// selection.
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
@@ -123,6 +138,10 @@ protected:
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
unsigned getStackAlignment() const { return stackAlignment; }
+
+ /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
+ /// symbol.
+ bool GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) const;
};
} // End llvm namespace
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 2344733..32ddc20 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -11,188 +11,122 @@
//===----------------------------------------------------------------------===//
#include "ARMTargetMachine.h"
-#include "ARMTargetAsmInfo.h"
+#include "ARMMCAsmInfo.h"
#include "ARMFrameInfo.h"
#include "ARM.h"
-#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-static cl::opt<bool> DisableLdStOpti("disable-arm-loadstore-opti", cl::Hidden,
- cl::desc("Disable load store optimization pass"));
-static cl::opt<bool> DisableIfConversion("disable-arm-if-conversion",cl::Hidden,
- cl::desc("Disable if-conversion pass"));
-
-/// ARMTargetMachineModule - Note that this is used on hosts that cannot link
-/// in a library unless there are references into the library. In particular,
-/// it seems that it is not possible to get things to work on Win32 without
-/// this. Though it is unused, do not remove it.
-extern "C" int ARMTargetMachineModule;
-int ARMTargetMachineModule = 0;
-
-// Register the target.
-static RegisterTarget<ARMTargetMachine> X("arm", "ARM");
-static RegisterTarget<ThumbTargetMachine> Y("thumb", "Thumb");
-
-// Force static initialization.
-extern "C" void LLVMInitializeARMTarget() { }
-
-// No assembler printer by default
-ARMBaseTargetMachine::AsmPrinterCtorFn ARMBaseTargetMachine::AsmPrinterCtor = 0;
-
-/// ThumbTargetMachine - Create an Thumb architecture model.
-///
-unsigned ThumbTargetMachine::getJITMatchQuality() {
-#if defined(__thumb__)
- return 10;
-#endif
- return 0;
+static const MCAsmInfo *createMCAsmInfo(const Target &T,
+ const StringRef &TT) {
+ Triple TheTriple(TT);
+ switch (TheTriple.getOS()) {
+ case Triple::Darwin:
+ return new ARMMCAsmInfoDarwin();
+ default:
+ return new ARMELFMCAsmInfo();
+ }
}
-unsigned ThumbTargetMachine::getModuleMatchQuality(const Module &M) {
- std::string TT = M.getTargetTriple();
- // Match thumb-foo-bar, as well as things like thumbv5blah-*
- if (TT.size() >= 6 &&
- (TT.substr(0, 6) == "thumb-" || TT.substr(0, 6) == "thumbv"))
- return 20;
- // If the target triple is something non-thumb, we don't match.
- if (!TT.empty()) return 0;
+extern "C" void LLVMInitializeARMTarget() {
+ // Register the target.
+ RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget);
+ RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget);
- if (M.getEndianness() == Module::LittleEndian &&
- M.getPointerSize() == Module::Pointer32)
- return 10; // Weak match
- else if (M.getEndianness() != Module::AnyEndianness ||
- M.getPointerSize() != Module::AnyPointerSize)
- return 0; // Match for some other target
-
- return getJITMatchQuality()/2;
+ // Register the target asm info.
+ RegisterAsmInfoFn A(TheARMTarget, createMCAsmInfo);
+ RegisterAsmInfoFn B(TheThumbTarget, createMCAsmInfo);
}
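
This registration replaces the old getModuleMatchQuality scheme: targets now live in a registry keyed by triple, and each LLVMInitialize*Target call installs factory callbacks up front. A self-contained toy model of that flow (plain C++, emphatically not the real llvm::TargetRegistry), reproducing the OS dispatch from createMCAsmInfo above:

#include <cstdio>
#include <functional>
#include <map>
#include <string>

struct ToyTarget {
  std::function<const char *(const std::string &)> CreateAsmInfo;
};

static std::map<std::string, ToyTarget> Registry; // triple prefix -> target

static void InitializeToyARMTarget() {
  Registry["arm"].CreateAsmInfo = [](const std::string &TT) {
    // Same OS dispatch as createMCAsmInfo above.
    return TT.find("darwin") != std::string::npos ? "ARMMCAsmInfoDarwin"
                                                  : "ARMELFMCAsmInfo";
  };
}

int main() {
  InitializeToyARMTarget();
  std::printf("%s\n", Registry["arm"].CreateAsmInfo("armv7-apple-darwin"));
  return 0;
}
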
/// TargetMachine ctor - Create an ARM architecture model.
///
-ARMBaseTargetMachine::ARMBaseTargetMachine(const Module &M,
+ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T,
+ const std::string &TT,
const std::string &FS,
bool isThumb)
- : Subtarget(M, FS, isThumb),
+ : LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS, isThumb),
FrameInfo(Subtarget),
JITInfo(),
InstrItins(Subtarget.getInstrItineraryData()) {
DefRelocModel = getRelocationModel();
}
-ARMTargetMachine::ARMTargetMachine(const Module &M, const std::string &FS)
- : ARMBaseTargetMachine(M, FS, false), InstrInfo(Subtarget),
+ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget),
DataLayout(Subtarget.isAPCS_ABI() ?
std::string("e-p:32:32-f64:32:32-i64:32:32") :
std::string("e-p:32:32-f64:64:64-i64:64:64")),
TLInfo(*this) {
}
-ThumbTargetMachine::ThumbTargetMachine(const Module &M, const std::string &FS)
- : ARMBaseTargetMachine(M, FS, true),
+ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : ARMBaseTargetMachine(T, TT, FS, true),
+ InstrInfo(Subtarget.hasThumb2()
+ ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
+ : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
DataLayout(Subtarget.isAPCS_ABI() ?
std::string("e-p:32:32-f64:32:32-i64:32:32-"
"i16:16:32-i8:8:32-i1:8:32-a:0:32") :
std::string("e-p:32:32-f64:64:64-i64:64:64-"
"i16:16:32-i8:8:32-i1:8:32-a:0:32")),
TLInfo(*this) {
- // Create the approriate type of Thumb InstrInfo
- if (Subtarget.hasThumb2())
- InstrInfo = new Thumb2InstrInfo(Subtarget);
- else
- InstrInfo = new Thumb1InstrInfo(Subtarget);
-}
-
-unsigned ARMTargetMachine::getJITMatchQuality() {
-#if defined(__arm__)
- return 10;
-#endif
- return 0;
-}
-
-unsigned ARMTargetMachine::getModuleMatchQuality(const Module &M) {
- std::string TT = M.getTargetTriple();
- // Match arm-foo-bar, as well as things like armv5blah-*
- if (TT.size() >= 4 &&
- (TT.substr(0, 4) == "arm-" || TT.substr(0, 4) == "armv"))
- return 20;
- // If the target triple is something non-arm, we don't match.
- if (!TT.empty()) return 0;
-
- if (M.getEndianness() == Module::LittleEndian &&
- M.getPointerSize() == Module::Pointer32)
- return 10; // Weak match
- else if (M.getEndianness() != Module::AnyEndianness ||
- M.getPointerSize() != Module::AnyPointerSize)
- return 0; // Match for some other target
-
- return getJITMatchQuality()/2;
}
-const TargetAsmInfo *ARMBaseTargetMachine::createTargetAsmInfo() const {
- switch (Subtarget.TargetType) {
- case ARMSubtarget::isDarwin:
- return new ARMDarwinTargetAsmInfo(*this);
- case ARMSubtarget::isELF:
- return new ARMELFTargetAsmInfo(*this);
- default:
- return new ARMGenericTargetAsmInfo(*this);
- }
-}
-
// Pass Pipeline Configuration
bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
- PM.add(createARMISelDag(*this));
+ PM.add(createARMISelDag(*this, OptLevel));
return false;
}
bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
- // FIXME: temporarily disabling load / store optimization pass for Thumb mode.
- if (OptLevel != CodeGenOpt::None && !DisableLdStOpti && !Subtarget.isThumb())
+ if (Subtarget.hasNEON())
+ PM.add(createNEONPreAllocPass());
+
+ // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+ if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
PM.add(createARMLoadStoreOptimizationPass(true));
return true;
}
-bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel) {
- // FIXME: temporarily disabling load / store optimization pass for Thumb mode.
- if (OptLevel != CodeGenOpt::None && !DisableLdStOpti && !Subtarget.isThumb())
+bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+ if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
PM.add(createARMLoadStoreOptimizationPass());
- if (OptLevel != CodeGenOpt::None &&
- !DisableIfConversion && !Subtarget.isThumb())
- PM.add(createIfConverterPass());
-
- PM.add(createARMConstantIslandPass());
return true;
}
-bool ARMBaseTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose,
- raw_ostream &Out) {
- // Output assembly language.
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(Out, *this, Verbose));
+bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+ if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
+ PM.add(createIfConverterPass());
+
+ if (Subtarget.isThumb2()) {
+ PM.add(createThumb2ITBlockPass());
+ PM.add(createThumb2SizeReductionPass());
+ }
- return false;
+ PM.add(createARMConstantIslandPass());
+ return true;
}
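
Taken together, the hooks in this file slot the ARM passes around the generic codegen pipeline. The sketch below only paraphrases that ordering (the generic stages it prints are simplified), showing what gets scheduled for a Cortex-A8-like Thumb2 configuration:

#include <cstdio>

enum OptLevel { None, Aggressive };

static void buildARMPipeline(OptLevel OL, bool HasNEON, bool Thumb1Only,
                             bool Thumb2) {
  std::puts("instruction selection (createARMISelDag)");
  if (HasNEON)                   std::puts("pre-RA:    NEON pre-allocation");
  if (OL != None && !Thumb1Only) std::puts("pre-RA:    load/store optimizer");
  std::puts("register allocation");
  if (OL != None && !Thumb1Only) std::puts("presched2: load/store optimizer");
  std::puts("post-RA scheduling (when the subtarget enables it)");
  if (OL != None && !Thumb1Only) std::puts("pre-emit:  if-conversion");
  if (Thumb2) {
    std::puts("pre-emit:  Thumb2 IT blocks");
    std::puts("pre-emit:  Thumb2 size reduction");
  }
  std::puts("pre-emit:  constant islands");
}

int main() { buildARMPipeline(Aggressive, true, false, true); return 0; }
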
-
bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE) {
// FIXME: Move this to TargetJITInfo!
if (DefRelocModel == Reloc::Default)
@@ -200,18 +134,11 @@ bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
// Machine code emitter pass for ARM.
PM.add(createARMCodeEmitterPass(*this, MCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
-
return false;
}
bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &JCE) {
// FIXME: Move this to TargetJITInfo!
if (DefRelocModel == Reloc::Default)
@@ -219,43 +146,42 @@ bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
// Machine code emitter pass for ARM.
PM.add(createARMJITCodeEmitterPass(*this, JCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
+ return false;
+}
+
+bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE) {
+ // FIXME: Move this to TargetJITInfo!
+ if (DefRelocModel == Reloc::Default)
+ setRelocationModel(Reloc::Static);
+ // Machine code emitter pass for ARM.
+ PM.add(createARMObjectCodeEmitterPass(*this, OCE));
return false;
}
bool ARMBaseTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE) {
// Machine code emitter pass for ARM.
PM.add(createARMCodeEmitterPass(*this, MCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
-
return false;
}
bool ARMBaseTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &JCE) {
// Machine code emitter pass for ARM.
PM.add(createARMJITCodeEmitterPass(*this, JCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
-
return false;
}
+bool ARMBaseTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE) {
+ // Machine code emitter pass for ARM.
+ PM.add(createARMObjectCodeEmitterPass(*this, OCE));
+ return false;
+}
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index a0df54d..71a5348 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -16,7 +16,6 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetFrameInfo.h"
#include "ARMInstrInfo.h"
#include "ARMFrameInfo.h"
#include "ARMJITInfo.h"
@@ -27,8 +26,6 @@
namespace llvm {
-class Module;
-
class ARMBaseTargetMachine : public LLVMTargetMachine {
protected:
ARMSubtarget Subtarget;
@@ -39,16 +36,9 @@ private:
InstrItineraryData InstrItins;
Reloc::Model DefRelocModel; // Reloc model before it's overridden.
-protected:
- // To avoid having target depend on the asmprinter stuff libraries, asmprinter
- // set this functions to ctor pointer at startup time if they are linked in.
- typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
- ARMBaseTargetMachine &tm,
- bool verbose);
- static AsmPrinterCtorFn AsmPrinterCtor;
-
public:
- ARMBaseTargetMachine(const Module &M, const std::string &FS, bool isThumb);
+ ARMBaseTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool isThumb);
virtual const ARMFrameInfo *getFrameInfo() const { return &FrameInfo; }
virtual ARMJITInfo *getJITInfo() { return &JITInfo; }
@@ -57,34 +47,26 @@ public:
return InstrItins;
}
- static void registerAsmPrinter(AsmPrinterCtorFn F) {
- AsmPrinterCtor = F;
- }
-
- static unsigned getModuleMatchQuality(const Module &M);
- static unsigned getJITMatchQuality();
-
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
-
// Pass Pipeline Configuration
virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreSched2(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out);
virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool DumpAsm, MachineCodeEmitter &MCE);
+ MachineCodeEmitter &MCE);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ JITCodeEmitter &MCE);
virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool DumpAsm, JITCodeEmitter &MCE);
+ ObjectCodeEmitter &OCE);
virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE);
virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &MCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE);
};
/// ARMTargetMachine - ARM target machine.
@@ -94,7 +76,8 @@ class ARMTargetMachine : public ARMBaseTargetMachine {
const TargetData DataLayout; // Calculates type size & alignment
ARMTargetLowering TLInfo;
public:
- ARMTargetMachine(const Module &M, const std::string &FS);
+ ARMTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
virtual const ARMRegisterInfo *getRegisterInfo() const {
return &InstrInfo.getRegisterInfo();
@@ -106,9 +89,6 @@ public:
virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const TargetData *getTargetData() const { return &DataLayout; }
-
- static unsigned getJITMatchQuality();
- static unsigned getModuleMatchQuality(const Module &M);
};
/// ThumbTargetMachine - Thumb target machine.
@@ -120,7 +100,8 @@ class ThumbTargetMachine : public ARMBaseTargetMachine {
const TargetData DataLayout; // Calculates type size & alignment
ARMTargetLowering TLInfo;
public:
- ThumbTargetMachine(const Module &M, const std::string &FS);
+ ThumbTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
  /// returns either Thumb1RegisterInfo or Thumb2RegisterInfo
virtual const ARMBaseRegisterInfo *getRegisterInfo() const {
@@ -134,9 +115,6 @@ public:
/// returns either Thumb1InstrInfo or Thumb2InstrInfo
virtual const ARMBaseInstrInfo *getInstrInfo() const { return InstrInfo; }
virtual const TargetData *getTargetData() const { return &DataLayout; }
-
- static unsigned getJITMatchQuality();
- static unsigned getModuleMatchQuality(const Module &M);
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
new file mode 100644
index 0000000..9703403
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -0,0 +1,39 @@
+//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+#define LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/MC/MCSectionELF.h"
+
+namespace llvm {
+
+ class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
+ public:
+ ARMElfTargetObjectFile() : TargetLoweringObjectFileELF() {}
+
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+ if (TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI()) {
+ StaticCtorSection =
+ getELFSection(".init_array", MCSectionELF::SHT_INIT_ARRAY,
+ MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+ SectionKind::getDataRel());
+ StaticDtorSection =
+ getELFSection(".fini_array", MCSectionELF::SHT_FINI_ARRAY,
+ MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+ SectionKind::getDataRel());
+ }
+ }
+ };
+} // end namespace llvm
+
+#endif
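
For background: the AAPCS C++ runtime walks .init_array/.fini_array rather than the traditional .ctors/.dtors, which is why Initialize retargets StaticCtorSection and StaticDtorSection above. A translation unit like the one below is the kind of code whose initializer ends up in whichever section the target object file selects:

#include <cstdio>

struct Init {
  // Runs before main; on an AAPCS ELF target a pointer to the initializer
  // for TheInit is emitted into .init_array.
  Init() { std::puts("static constructor"); }
};

static Init TheInit;

int main() { return 0; }
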
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
new file mode 100644
index 0000000..7438ea9
--- /dev/null
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -0,0 +1,618 @@
+//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLexer.h"
+#include "llvm/MC/MCAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetAsmParser.h"
+using namespace llvm;
+
+namespace {
+struct ARMOperand;
+
+// The shift types for register-controlled shifts in ARM memory addressing
+enum ShiftType {
+ Lsl,
+ Lsr,
+ Asr,
+ Ror,
+ Rrx
+};
+
+class ARMAsmParser : public TargetAsmParser {
+ MCAsmParser &Parser;
+
+private:
+ MCAsmParser &getParser() const { return Parser; }
+
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+
+ bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+
+ bool ParseRegister(ARMOperand &Op);
+
+ bool ParseRegisterList(ARMOperand &Op);
+
+ bool ParseMemory(ARMOperand &Op);
+
+ bool ParseShift(enum ShiftType *St, const MCExpr *&ShiftAmount);
+
+ bool ParseOperand(ARMOperand &Op);
+
+ bool ParseDirectiveWord(unsigned Size, SMLoc L);
+
+ // TODO - For now hacked versions of the next two are in here in this file to
+ // allow some parser testing until the table gen versions are implemented.
+
+ /// @name Auto-generated Match Functions
+ /// {
+ bool MatchInstruction(SmallVectorImpl<ARMOperand> &Operands,
+ MCInst &Inst);
+
+ /// MatchRegisterName - Match the given string to a register name and return
+ /// its register number, or -1 if there is no match. To allow return values
+ /// to be used directly in register lists, arm registers have values between
+ /// 0 and 15.
+ int MatchRegisterName(const StringRef &Name);
+
+ /// }
+
+
+public:
+ ARMAsmParser(const Target &T, MCAsmParser &_Parser)
+ : TargetAsmParser(T), Parser(_Parser) {}
+
+ virtual bool ParseInstruction(const StringRef &Name, MCInst &Inst);
+
+ virtual bool ParseDirective(AsmToken DirectiveID);
+};
+
+} // end anonymous namespace
+
+namespace {
+
+/// ARMOperand - Instances of this class represent a parsed ARM machine
+/// instruction.
+struct ARMOperand {
+ enum {
+ Token,
+ Register,
+ Immediate,
+ Memory
+ } Kind;
+
+
+ union {
+ struct {
+ const char *Data;
+ unsigned Length;
+ } Tok;
+
+ struct {
+ unsigned RegNum;
+ bool Writeback;
+ } Reg;
+
+ struct {
+ const MCExpr *Val;
+ } Imm;
+
+ // This is for all forms of ARM address expressions
+ struct {
+ unsigned BaseRegNum;
+ bool OffsetIsReg;
+ const MCExpr *Offset; // used when OffsetIsReg is false
+ unsigned OffsetRegNum; // used when OffsetIsReg is true
+ bool OffsetRegShifted; // only used when OffsetIsReg is true
+ enum ShiftType ShiftType; // used when OffsetRegShifted is true
+ const MCExpr *ShiftAmount; // used when OffsetRegShifted is true
+ bool Preindexed;
+ bool Postindexed;
+ bool Negative; // only used when OffsetIsReg is true
+ bool Writeback;
+ } Mem;
+
+ };
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ unsigned getReg() const {
+ assert(Kind == Register && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+  bool isToken() const { return Kind == Token; }
+
+ bool isReg() const { return Kind == Register; }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ static ARMOperand CreateToken(StringRef Str) {
+ ARMOperand Res;
+ Res.Kind = Token;
+ Res.Tok.Data = Str.data();
+ Res.Tok.Length = Str.size();
+ return Res;
+ }
+
+ static ARMOperand CreateReg(unsigned RegNum, bool Writeback) {
+ ARMOperand Res;
+ Res.Kind = Register;
+ Res.Reg.RegNum = RegNum;
+ Res.Reg.Writeback = Writeback;
+ return Res;
+ }
+
+ static ARMOperand CreateImm(const MCExpr *Val) {
+ ARMOperand Res;
+ Res.Kind = Immediate;
+ Res.Imm.Val = Val;
+ return Res;
+ }
+
+ static ARMOperand CreateMem(unsigned BaseRegNum, bool OffsetIsReg,
+ const MCExpr *Offset, unsigned OffsetRegNum,
+ bool OffsetRegShifted, enum ShiftType ShiftType,
+ const MCExpr *ShiftAmount, bool Preindexed,
+ bool Postindexed, bool Negative, bool Writeback) {
+ ARMOperand Res;
+ Res.Kind = Memory;
+ Res.Mem.BaseRegNum = BaseRegNum;
+ Res.Mem.OffsetIsReg = OffsetIsReg;
+ Res.Mem.Offset = Offset;
+ Res.Mem.OffsetRegNum = OffsetRegNum;
+ Res.Mem.OffsetRegShifted = OffsetRegShifted;
+ Res.Mem.ShiftType = ShiftType;
+ Res.Mem.ShiftAmount = ShiftAmount;
+ Res.Mem.Preindexed = Preindexed;
+ Res.Mem.Postindexed = Postindexed;
+ Res.Mem.Negative = Negative;
+ Res.Mem.Writeback = Writeback;
+ return Res;
+ }
+};
+
+} // end anonymous namespace.
+
+// Try to parse a register name. The token must be an Identifier when called,
+// and if it is a register name a Reg operand is created, the token is eaten
+// and false is returned. Else true is returned and no token is eaten.
+// TODO: this is likely to change to allow different register types and/or to
+// parse for a specific register type.
+bool ARMAsmParser::ParseRegister(ARMOperand &Op) {
+ const AsmToken &Tok = getLexer().getTok();
+ assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+ // FIXME: Validate register for the current architecture; we have to do
+ // validation later, so maybe there is no need for this here.
+ int RegNum;
+
+ RegNum = MatchRegisterName(Tok.getString());
+ if (RegNum == -1)
+ return true;
+ getLexer().Lex(); // Eat identifier token.
+
+ bool Writeback = false;
+ const AsmToken &ExclaimTok = getLexer().getTok();
+ if (ExclaimTok.is(AsmToken::Exclaim)) {
+ Writeback = true;
+ getLexer().Lex(); // Eat exclaim token
+ }
+
+ Op = ARMOperand::CreateReg(RegNum, Writeback);
+
+ return false;
+}
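
As a concrete illustration, relying only on the ARMOperand factory above and the register numbering in MatchRegisterName further down (where "sp" is 13): parsing the token stream for "sp!" eats both tokens and produces the same operand as this direct call.

#include <cassert>

void exampleRegisterOperand() {
  ARMOperand Op = ARMOperand::CreateReg(/*RegNum=*/13, /*Writeback=*/true);
  assert(Op.isReg() && Op.getReg() == 13);
}
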
+
+// Try to parse a register list. The first token must be a '{' when called
+// for now.
+bool ARMAsmParser::ParseRegisterList(ARMOperand &Op) {
+ assert(getLexer().getTok().is(AsmToken::LCurly) &&
+ "Token is not an Left Curly Brace");
+ getLexer().Lex(); // Eat left curly brace token.
+
+ const AsmToken &RegTok = getLexer().getTok();
+ SMLoc RegLoc = RegTok.getLoc();
+ if (RegTok.isNot(AsmToken::Identifier))
+ return Error(RegLoc, "register expected");
+ int RegNum = MatchRegisterName(RegTok.getString());
+ if (RegNum == -1)
+ return Error(RegLoc, "register expected");
+ getLexer().Lex(); // Eat identifier token.
+ unsigned RegList = 1 << RegNum;
+
+ int HighRegNum = RegNum;
+ // TODO ranges like "{Rn-Rm}"
+ while (getLexer().getTok().is(AsmToken::Comma)) {
+ getLexer().Lex(); // Eat comma token.
+
+ const AsmToken &RegTok = getLexer().getTok();
+ SMLoc RegLoc = RegTok.getLoc();
+ if (RegTok.isNot(AsmToken::Identifier))
+ return Error(RegLoc, "register expected");
+ int RegNum = MatchRegisterName(RegTok.getString());
+ if (RegNum == -1)
+ return Error(RegLoc, "register expected");
+
+ if (RegList & (1 << RegNum))
+ Warning(RegLoc, "register duplicated in register list");
+ else if (RegNum <= HighRegNum)
+ Warning(RegLoc, "register not in ascending order in register list");
+ RegList |= 1 << RegNum;
+ HighRegNum = RegNum;
+
+ getLexer().Lex(); // Eat identifier token.
+ }
+ const AsmToken &RCurlyTok = getLexer().getTok();
+ if (RCurlyTok.isNot(AsmToken::RCurly))
+ return Error(RCurlyTok.getLoc(), "'}' expected");
+  getLexer().Lex(); // Eat right curly brace token.
+
+ return false;
+}
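
The loop above folds the list into a 16-bit mask as it goes. This stand-alone helper repeats the same duplicate and ascending-order checks on plain ints; note the parser above only warns in both cases rather than rejecting the list:

#include <cstdio>

static void checkRegList(const int *Regs, int N) {
  unsigned Mask = 0;
  int High = -1;
  for (int i = 0; i != N; ++i) {
    if (Mask & (1u << Regs[i]))
      std::printf("r%d: duplicated in register list\n", Regs[i]);
    else if (Regs[i] <= High)
      std::printf("r%d: not in ascending order\n", Regs[i]);
    Mask |= 1u << Regs[i];
    High = Regs[i];
  }
}

int main() {
  const int L[] = {0, 2, 1, 2}; // {r0, r2, r1, r2}
  checkRegList(L, 4);           // warns on r1 (order) and r2 (duplicate)
  return 0;
}
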
+
+// Try to parse an ARM memory expression. It must start with a '[' token.
+// TODO: only pre-indexed and post-indexed addressing are handled so far;
+// unindexed addressing with an option, etc., is still to do.
+bool ARMAsmParser::ParseMemory(ARMOperand &Op) {
+ assert(getLexer().getTok().is(AsmToken::LBrac) &&
+ "Token is not an Left Bracket");
+ getLexer().Lex(); // Eat left bracket token.
+
+ const AsmToken &BaseRegTok = getLexer().getTok();
+ if (BaseRegTok.isNot(AsmToken::Identifier))
+ return Error(BaseRegTok.getLoc(), "register expected");
+ int BaseRegNum = MatchRegisterName(BaseRegTok.getString());
+ if (BaseRegNum == -1)
+ return Error(BaseRegTok.getLoc(), "register expected");
+ getLexer().Lex(); // Eat identifier token.
+
+ bool Preindexed = false;
+ bool Postindexed = false;
+ bool OffsetIsReg = false;
+ bool Negative = false;
+ bool Writeback = false;
+
+ // First look for preindexed address forms:
+ // [Rn, +/-Rm]
+ // [Rn, #offset]
+ // [Rn, +/-Rm, shift]
+  // That is, after the "[Rn" we have so far, see if the next token is a comma.
+ const AsmToken &Tok = getLexer().getTok();
+ if (Tok.is(AsmToken::Comma)) {
+ Preindexed = true;
+ getLexer().Lex(); // Eat comma token.
+
+ const AsmToken &NextTok = getLexer().getTok();
+ if (NextTok.is(AsmToken::Plus))
+ getLexer().Lex(); // Eat plus token.
+ else if (NextTok.is(AsmToken::Minus)) {
+ Negative = true;
+ getLexer().Lex(); // Eat minus token
+ }
+
+ // See if there is a register following the "[Rn," we have so far.
+ const AsmToken &OffsetRegTok = getLexer().getTok();
+ int OffsetRegNum = MatchRegisterName(OffsetRegTok.getString());
+ bool OffsetRegShifted = false;
+    enum ShiftType ShiftType = Lsl;  // initialized so the CreateMem call
+    const MCExpr *ShiftAmount = 0;   // below never copies garbage when no
+    const MCExpr *Offset = 0;        // shift or immediate offset is parsed
+ if (OffsetRegNum != -1) {
+ OffsetIsReg = true;
+ getLexer().Lex(); // Eat identifier token for the offset register.
+ // Look for a comma then a shift
+ const AsmToken &Tok = getLexer().getTok();
+ if (Tok.is(AsmToken::Comma)) {
+ getLexer().Lex(); // Eat comma token.
+
+ const AsmToken &Tok = getLexer().getTok();
+ if (ParseShift(&ShiftType, ShiftAmount))
+ return Error(Tok.getLoc(), "shift expected");
+ OffsetRegShifted = true;
+ }
+ }
+ else { // "[Rn," we have so far was not followed by "Rm"
+ // Look for #offset following the "[Rn,"
+ const AsmToken &HashTok = getLexer().getTok();
+ if (HashTok.isNot(AsmToken::Hash))
+ return Error(HashTok.getLoc(), "'#' expected");
+ getLexer().Lex(); // Eat hash token.
+
+ if (getParser().ParseExpression(Offset))
+ return true;
+ }
+ const AsmToken &RBracTok = getLexer().getTok();
+ if (RBracTok.isNot(AsmToken::RBrac))
+ return Error(RBracTok.getLoc(), "']' expected");
+ getLexer().Lex(); // Eat right bracket token.
+
+ const AsmToken &ExclaimTok = getLexer().getTok();
+ if (ExclaimTok.is(AsmToken::Exclaim)) {
+ Writeback = true;
+ getLexer().Lex(); // Eat exclaim token
+ }
+ Op = ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
+ OffsetRegShifted, ShiftType, ShiftAmount,
+ Preindexed, Postindexed, Negative, Writeback);
+ return false;
+ }
+ // The "[Rn" we have so far was not followed by a comma.
+ else if (Tok.is(AsmToken::RBrac)) {
+ // This is a post indexing addressing forms:
+ // [Rn], #offset
+ // [Rn], +/-Rm
+ // [Rn], +/-Rm, shift
+ // that is a ']' follows after the "[Rn".
+ Postindexed = true;
+ Writeback = true;
+ getLexer().Lex(); // Eat right bracket token.
+
+ const AsmToken &CommaTok = getLexer().getTok();
+ if (CommaTok.isNot(AsmToken::Comma))
+ return Error(CommaTok.getLoc(), "',' expected");
+ getLexer().Lex(); // Eat comma token.
+
+ const AsmToken &NextTok = getLexer().getTok();
+ if (NextTok.is(AsmToken::Plus))
+ getLexer().Lex(); // Eat plus token.
+ else if (NextTok.is(AsmToken::Minus)) {
+ Negative = true;
+ getLexer().Lex(); // Eat minus token
+ }
+
+ // See if there is a register following the "[Rn]," we have so far.
+ const AsmToken &OffsetRegTok = getLexer().getTok();
+ int OffsetRegNum = MatchRegisterName(OffsetRegTok.getString());
+ bool OffsetRegShifted = false;
+    enum ShiftType ShiftType = Lsl;  // initialized so the CreateMem call
+    const MCExpr *ShiftAmount = 0;   // below never copies garbage when no
+    const MCExpr *Offset = 0;        // shift or immediate offset is parsed
+ if (OffsetRegNum != -1) {
+ OffsetIsReg = true;
+ getLexer().Lex(); // Eat identifier token for the offset register.
+ // Look for a comma then a shift
+ const AsmToken &Tok = getLexer().getTok();
+ if (Tok.is(AsmToken::Comma)) {
+ getLexer().Lex(); // Eat comma token.
+
+ const AsmToken &Tok = getLexer().getTok();
+ if (ParseShift(&ShiftType, ShiftAmount))
+ return Error(Tok.getLoc(), "shift expected");
+ OffsetRegShifted = true;
+ }
+ }
+ else { // "[Rn]," we have so far was not followed by "Rm"
+ // Look for #offset following the "[Rn],"
+ const AsmToken &HashTok = getLexer().getTok();
+ if (HashTok.isNot(AsmToken::Hash))
+ return Error(HashTok.getLoc(), "'#' expected");
+ getLexer().Lex(); // Eat hash token.
+
+ if (getParser().ParseExpression(Offset))
+ return true;
+ }
+ Op = ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
+ OffsetRegShifted, ShiftType, ShiftAmount,
+ Preindexed, Postindexed, Negative, Writeback);
+ return false;
+ }
+
+ return true;
+}
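
To pin down the Mem fields, here is the operand the pre-indexed path above would build for "[r1, r2, lsl #2]!". Ctx stands for an MCContext assumed to be in scope; MCConstantExpr comes from the MCExpr.h header this file already includes, and everything else is defined earlier in the file:

void exampleMemOperand(MCContext &Ctx) {
  const MCExpr *Two = MCConstantExpr::Create(2, Ctx);
  ARMOperand Op = ARMOperand::CreateMem(
      /*BaseRegNum=*/1,    /*OffsetIsReg=*/true,      /*Offset=*/0,
      /*OffsetRegNum=*/2,  /*OffsetRegShifted=*/true, /*ShiftType=*/Lsl,
      /*ShiftAmount=*/Two, /*Preindexed=*/true,       /*Postindexed=*/false,
      /*Negative=*/false,  /*Writeback=*/true);
  (void)Op;
}
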
+
+/// ParseShift as one of these two:
+/// ( lsl | lsr | asr | ror ) , # shift_amount
+/// rrx
+/// and returns false if it successfully parses a shift, otherwise true.
+bool ARMAsmParser::ParseShift(ShiftType *St, const MCExpr *&ShiftAmount) {
+ const AsmToken &Tok = getLexer().getTok();
+ if (Tok.isNot(AsmToken::Identifier))
+ return true;
+ const StringRef &ShiftName = Tok.getString();
+ if (ShiftName == "lsl" || ShiftName == "LSL")
+ *St = Lsl;
+ else if (ShiftName == "lsr" || ShiftName == "LSR")
+ *St = Lsr;
+ else if (ShiftName == "asr" || ShiftName == "ASR")
+ *St = Asr;
+ else if (ShiftName == "ror" || ShiftName == "ROR")
+ *St = Ror;
+ else if (ShiftName == "rrx" || ShiftName == "RRX")
+ *St = Rrx;
+ else
+ return true;
+ getLexer().Lex(); // Eat shift type token.
+
+  // For all but rrx (rotate right with extend) there must be a '#' and a shift amount.
+ if (*St != Rrx) {
+ // Look for # following the shift type
+ const AsmToken &HashTok = getLexer().getTok();
+ if (HashTok.isNot(AsmToken::Hash))
+ return Error(HashTok.getLoc(), "'#' expected");
+ getLexer().Lex(); // Eat hash token.
+
+ if (getParser().ParseExpression(ShiftAmount))
+ return true;
+ }
+
+ return false;
+}
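
For reference, the operations the ShiftType enumerators name; the parser merely records them, but their semantics on 32-bit values are easy to state. A stand-alone helper, assuming shift amounts in 1..31 for the first four forms and the incoming carry flag for rrx:

#include <cstdint>

static uint32_t applyShift(int St, uint32_t V, unsigned Amt, bool Carry) {
  switch (St) {
  case Lsl: return V << Amt;                           // logical shift left
  case Lsr: return V >> Amt;                           // logical shift right
  case Asr: return (uint32_t)((int32_t)V >> Amt);      // arithmetic shift right
  case Ror: return (V >> Amt) | (V << (32 - Amt));     // rotate right
  default:  return (V >> 1) | ((uint32_t)Carry << 31); // rrx: rotate through carry
  }
}
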
+
+// A hack to allow some testing
+int ARMAsmParser::MatchRegisterName(const StringRef &Name) {
+ if (Name == "r0" || Name == "R0")
+ return 0;
+ else if (Name == "r1" || Name == "R1")
+ return 1;
+ else if (Name == "r2" || Name == "R2")
+ return 2;
+ else if (Name == "r3" || Name == "R3")
+ return 3;
+ else if (Name == "r3" || Name == "R3")
+ return 3;
+ else if (Name == "r4" || Name == "R4")
+ return 4;
+ else if (Name == "r5" || Name == "R5")
+ return 5;
+ else if (Name == "r6" || Name == "R6")
+ return 6;
+ else if (Name == "r7" || Name == "R7")
+ return 7;
+ else if (Name == "r8" || Name == "R8")
+ return 8;
+ else if (Name == "r9" || Name == "R9")
+ return 9;
+ else if (Name == "r10" || Name == "R10")
+ return 10;
+ else if (Name == "r11" || Name == "R11" || Name == "fp")
+ return 11;
+ else if (Name == "r12" || Name == "R12" || Name == "ip")
+ return 12;
+ else if (Name == "r13" || Name == "R13" || Name == "sp")
+ return 13;
+ else if (Name == "r14" || Name == "R14" || Name == "lr")
+ return 14;
+ else if (Name == "r15" || Name == "R15" || Name == "pc")
+ return 15;
+ return -1;
+}
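
The if-chain above is a stopgap; a hypothetical table-free equivalent that accepts r0..r15 plus the fp/ip/sp/lr/pc aliases case-insensitively (a sketch, not the tablegen'd matcher that would eventually replace this hack):

    #include <cassert>
    #include <cctype>
    #include <cstdlib>
    #include <string>

    // Sketch: lowercase the name, handle the aliases, then parse "rN".
    static int matchRegisterName(std::string Name) {
      for (std::string::size_type I = 0; I < Name.size(); ++I)
        Name[I] = (char)std::tolower((unsigned char)Name[I]);
      if (Name == "fp") return 11;
      if (Name == "ip") return 12;
      if (Name == "sp") return 13;
      if (Name == "lr") return 14;
      if (Name == "pc") return 15;
      if (Name.size() < 2 || Name[0] != 'r')
        return -1;
      for (std::string::size_type I = 1; I < Name.size(); ++I)
        if (!std::isdigit((unsigned char)Name[I]))
          return -1;
      int N = std::atoi(Name.c_str() + 1);
      return N <= 15 ? N : -1;
    }

    int main() {
      assert(matchRegisterName("R9") == 9);
      assert(matchRegisterName("sp") == 13 && matchRegisterName("r13") == 13);
      assert(matchRegisterName("r16") == -1 && matchRegisterName("x0") == -1);
    }
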
+
+// A hack to allow some testing
+bool ARMAsmParser::MatchInstruction(SmallVectorImpl<ARMOperand> &Operands,
+ MCInst &Inst) {
+ struct ARMOperand Op0 = Operands[0];
+ assert(Op0.Kind == ARMOperand::Token && "First operand not a Token");
+ const StringRef &Mnemonic = Op0.getToken();
+ if (Mnemonic == "add" ||
+ Mnemonic == "stmfd" ||
+ Mnemonic == "str" ||
+ Mnemonic == "ldmfd" ||
+ Mnemonic == "ldr" ||
+ Mnemonic == "mov" ||
+ Mnemonic == "sub")
+ return false;
+
+ return true;
+}
+
+// TODO - this is a work in progress
+bool ARMAsmParser::ParseOperand(ARMOperand &Op) {
+ switch (getLexer().getKind()) {
+ case AsmToken::Identifier:
+ if (!ParseRegister(Op))
+ return false;
+ // TODO parse other operands that start with an identifier like labels
+ return Error(getLexer().getTok().getLoc(), "labels not yet supported");
+  case AsmToken::LBrac:
+    // Propagate the result; falling through to the next case on a failed
+    // memory parse would be wrong.
+    return ParseMemory(Op);
+  case AsmToken::LCurly:
+    return ParseRegisterList(Op);
+ case AsmToken::Hash:
+ // #42 -> immediate.
+ // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate
+ getLexer().Lex();
+ const MCExpr *Val;
+ if (getParser().ParseExpression(Val))
+ return true;
+ Op = ARMOperand::CreateImm(Val);
+ return false;
+ default:
+ return Error(getLexer().getTok().getLoc(), "unexpected token in operand");
+ }
+}
+
+bool ARMAsmParser::ParseInstruction(const StringRef &Name, MCInst &Inst) {
+ SmallVector<ARMOperand, 7> Operands;
+
+ Operands.push_back(ARMOperand::CreateToken(Name));
+
+ SMLoc Loc = getLexer().getTok().getLoc();
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+
+ // Read the first operand.
+ Operands.push_back(ARMOperand());
+ if (ParseOperand(Operands.back()))
+ return true;
+
+ while (getLexer().is(AsmToken::Comma)) {
+ getLexer().Lex(); // Eat the comma.
+
+ // Parse and remember the operand.
+ Operands.push_back(ARMOperand());
+ if (ParseOperand(Operands.back()))
+ return true;
+ }
+ }
+ if (!MatchInstruction(Operands, Inst))
+ return false;
+
+ Error(Loc, "ARMAsmParser::ParseInstruction only partly implemented");
+ return true;
+}
+
+bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
+ StringRef IDVal = DirectiveID.getIdentifier();
+ if (IDVal == ".word")
+ return ParseDirectiveWord(4, DirectiveID.getLoc());
+ return true;
+}
+
+/// ParseDirectiveWord
+/// ::= .word [ expression (, expression)* ]
+bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ const MCExpr *Value;
+ if (getParser().ParseExpression(Value))
+ return true;
+
+ getParser().getStreamer().EmitValue(Value, Size);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ // FIXME: Improve diagnostic.
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(L, "unexpected token in directive");
+ getLexer().Lex();
+ }
+ }
+
+ getLexer().Lex();
+ return false;
+}
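
The loop above is the standard "expression list until end of statement" shape: parse one expression, stop at the statement end, otherwise insist on a comma. The same shape over a plain stream, as a runnable sketch (plain longs stand in for MCExprs; names are illustrative, not the MC parser API):

    #include <cassert>
    #include <istream>
    #include <sstream>
    #include <string>
    #include <vector>

    static bool parseWordList(std::istream &In, std::vector<long> &Out) {
      if ((In >> std::ws).peek() == std::char_traits<char>::eof())
        return false;                  // an empty operand list is allowed
      for (;;) {
        long V;
        if (!(In >> V))
          return true;                 // expression expected
        Out.push_back(V);
        char C;
        if (!(In >> C))
          break;                       // end of statement
        if (C != ',')
          return true;                 // unexpected token in directive
      }
      return false;
    }

    int main() {
      std::istringstream In("4, 8, 15");
      std::vector<long> Vals;
      assert(!parseWordList(In, Vals) && Vals.size() == 3 && Vals[2] == 15);
    }
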
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMAsmParser() {
+ RegisterAsmParser<ARMAsmParser> X(TheARMTarget);
+ RegisterAsmParser<ARMAsmParser> Y(TheThumbTarget);
+}
diff --git a/lib/Target/ARM/AsmParser/CMakeLists.txt b/lib/Target/ARM/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..308c6cf
--- /dev/null
+++ b/lib/Target/ARM/AsmParser/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARMAsmParser
+ ARMAsmParser.cpp
+ )
+
diff --git a/lib/Target/ARM/AsmParser/Makefile b/lib/Target/ARM/AsmParser/Makefile
new file mode 100644
index 0000000..97e5612
--- /dev/null
+++ b/lib/Target/ARM/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARMAsmParser
+
+# Hack: we need to include 'main' ARM target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
index 434a19a..546731b 100644
--- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
@@ -1,5 +1,3 @@
-//===-- ARMAsmPrinter.cpp - ARM LLVM assembly writer ----------------------===//
-//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
@@ -21,23 +19,30 @@
#include "ARMMachineFunctionInfo.h"
#include "llvm/Constants.h"
#include "llvm/Module.h"
-#include "llvm/MDNode.h"
+#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DwarfWriter.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Mangler.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/FormattedStream.h"
#include <cctype>
using namespace llvm;
@@ -45,7 +50,6 @@ STATISTIC(EmittedInsts, "Number of machine instrs printed");
namespace {
class VISIBILITY_HIDDEN ARMAsmPrinter : public AsmPrinter {
- DwarfWriter *DW;
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
/// make the right decision when printing asm code for different targets.
@@ -68,22 +72,18 @@ namespace {
/// GVNonLazyPtrs - Keeps the set of GlobalValues that require
/// non-lazy-pointers for indirect access.
- StringSet<> GVNonLazyPtrs;
+ StringMap<std::string> GVNonLazyPtrs;
/// HiddenGVNonLazyPtrs - Keeps the set of GlobalValues with hidden
/// visibility that require non-lazy-pointers for indirect access.
- StringSet<> HiddenGVNonLazyPtrs;
-
- /// FnStubs - Keeps the set of external function GlobalAddresses that the
- /// asm printer should generate stubs for.
- StringSet<> FnStubs;
+ StringMap<std::string> HiddenGVNonLazyPtrs;
/// True if asm printer is printing a series of CONSTPOOL_ENTRY.
bool InCPMode;
public:
- explicit ARMAsmPrinter(raw_ostream &O, TargetMachine &TM,
- const TargetAsmInfo *T, bool V)
- : AsmPrinter(O, TM, T, V), DW(0), AFI(NULL), MCP(NULL),
+ explicit ARMAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
+ : AsmPrinter(O, TM, T, V), AFI(NULL), MCP(NULL),
InCPMode(false) {
Subtarget = &TM.getSubtarget<ARMSubtarget>();
}
@@ -110,6 +110,7 @@ namespace {
const char *Modifier = 0);
void printBitfieldInvMaskImmOperand (const MachineInstr *MI, int OpNum);
+ void printThumbITMask(const MachineInstr *MI, int OpNum);
void printThumbAddrModeRROperand(const MachineInstr *MI, int OpNum);
void printThumbAddrModeRI5Operand(const MachineInstr *MI, int OpNum,
unsigned Scale);
@@ -118,10 +119,10 @@ namespace {
void printThumbAddrModeS4Operand(const MachineInstr *MI, int OpNum);
void printThumbAddrModeSPOperand(const MachineInstr *MI, int OpNum);
- void printT2SOImmOperand(const MachineInstr *MI, int OpNum);
void printT2SOOperand(const MachineInstr *MI, int OpNum);
void printT2AddrModeImm12Operand(const MachineInstr *MI, int OpNum);
void printT2AddrModeImm8Operand(const MachineInstr *MI, int OpNum);
+ void printT2AddrModeImm8s4Operand(const MachineInstr *MI, int OpNum);
void printT2AddrModeImm8OffsetOperand(const MachineInstr *MI, int OpNum);
void printT2AddrModeSoRegOperand(const MachineInstr *MI, int OpNum);
@@ -132,6 +133,9 @@ namespace {
void printCPInstOperand(const MachineInstr *MI, int OpNum,
const char *Modifier);
void printJTBlockOperand(const MachineInstr *MI, int OpNum);
+ void printJT2BlockOperand(const MachineInstr *MI, int OpNum);
+ void printTBAddrMode(const MachineInstr *MI, int OpNum);
+ void printNoHashImmediate(const MachineInstr *MI, int OpNum);
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
unsigned AsmVariant, const char *ExtraCode);
@@ -139,12 +143,14 @@ namespace {
unsigned AsmVariant,
const char *ExtraCode);
- void printModuleLevelGV(const GlobalVariable* GVar);
- bool printInstruction(const MachineInstr *MI); // autogenerated.
+ void PrintGlobalVariable(const GlobalVariable* GVar);
+ void printInstruction(const MachineInstr *MI); // autogenerated.
+ static const char *getRegisterName(unsigned RegNo);
+
void printMachineInstruction(const MachineInstr *MI);
bool runOnMachineFunction(MachineFunction &F);
- bool doInitialization(Module &M);
bool doFinalization(Module &M);
+ void EmitStartOfAsmFile(Module &M);
/// EmitMachineConstantPoolValue - Print a machine constantpool value to
/// the .s file.
@@ -153,24 +159,35 @@ namespace {
ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
GlobalValue *GV = ACPV->getGV();
- std::string Name = GV ? Mang->getValueName(GV) : TAI->getGlobalPrefix();
- if (!GV)
- Name += ACPV->getSymbol();
- if (ACPV->isNonLazyPointer()) {
- if (GV->hasHiddenVisibility())
- HiddenGVNonLazyPtrs.insert(Name);
- else
- GVNonLazyPtrs.insert(Name);
- printSuffixedName(Name, "$non_lazy_ptr");
- } else if (ACPV->isStub()) {
- FnStubs.insert(Name);
- printSuffixedName(Name, "$stub");
+ std::string Name;
+
+ if (ACPV->isLSDA()) {
+ SmallString<16> LSDAName;
+ raw_svector_ostream(LSDAName) << MAI->getPrivateGlobalPrefix() <<
+ "_LSDA_" << getFunctionNumber();
+ Name = LSDAName.str();
+ } else if (GV) {
+ bool isIndirect = Subtarget->isTargetDarwin() &&
+ Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
+ if (!isIndirect)
+ Name = Mang->getMangledName(GV);
+ else {
+ // FIXME: Remove this when Darwin transition to @GOT like syntax.
+ std::string SymName = Mang->getMangledName(GV);
+ Name = Mang->getMangledName(GV, "$non_lazy_ptr", true);
+ if (GV->hasHiddenVisibility())
+ HiddenGVNonLazyPtrs[SymName] = Name;
+ else
+ GVNonLazyPtrs[SymName] = Name;
+ }
} else
- O << Name;
+ Name = Mang->makeNameProper(ACPV->getSymbol());
+ O << Name;
+
if (ACPV->hasModifier()) O << "(" << ACPV->getModifier() << ")";
if (ACPV->getPCAdjustment() != 0) {
- O << "-(" << TAI->getPrivateGlobalPrefix() << "PC"
- << utostr(ACPV->getLabelId())
+ O << "-(" << MAI->getPrivateGlobalPrefix() << "PC"
+ << ACPV->getLabelId()
<< "+" << (unsigned)ACPV->getPCAdjustment();
if (ACPV->mustAddCurrentAddress())
O << "-.";
@@ -178,7 +195,7 @@ namespace {
}
O << "\n";
}
-
+
void getAnalysisUsage(AnalysisUsage &AU) const {
AsmPrinter::getAnalysisUsage(AU);
AU.setPreservesAll();
@@ -205,38 +222,39 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// NOTE: we don't print out constant pools here, they are handled as
// instructions.
- O << "\n";
+ O << '\n';
+
// Print out labels for the function.
const Function *F = MF.getFunction();
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
+
switch (F->getLinkage()) {
- default: assert(0 && "Unknown linkage type!");
+ default: llvm_unreachable("Unknown linkage type!");
case Function::PrivateLinkage:
case Function::InternalLinkage:
- SwitchToTextSection("\t.text", F);
break;
case Function::ExternalLinkage:
- SwitchToTextSection("\t.text", F);
O << "\t.globl\t" << CurrentFnName << "\n";
break;
+ case Function::LinkerPrivateLinkage:
case Function::WeakAnyLinkage:
case Function::WeakODRLinkage:
case Function::LinkOnceAnyLinkage:
case Function::LinkOnceODRLinkage:
if (Subtarget->isTargetDarwin()) {
- SwitchToTextSection(
- ".section __TEXT,__textcoal_nt,coalesced,pure_instructions", F);
O << "\t.globl\t" << CurrentFnName << "\n";
O << "\t.weak_definition\t" << CurrentFnName << "\n";
} else {
- O << TAI->getWeakRefDirective() << CurrentFnName << "\n";
+ O << MAI->getWeakRefDirective() << CurrentFnName << "\n";
}
break;
}
printVisibility(CurrentFnName, F->getVisibility());
+ unsigned FnAlign = 1 << MF.getAlignment(); // MF alignment is log2.
if (AFI->isThumbFunction()) {
- EmitAlignment(MF.getAlignment(), F, AFI->getAlign());
+ EmitAlignment(FnAlign, F, AFI->getAlign());
O << "\t.code\t16\n";
O << "\t.thumb_func";
if (Subtarget->isTargetDarwin())
@@ -244,7 +262,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
O << "\n";
InCPMode = false;
} else {
- EmitAlignment(MF.getAlignment(), F);
+ EmitAlignment(FnAlign, F);
}
O << CurrentFnName << ":\n";
@@ -266,8 +284,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
I != E; ++I) {
// Print a label for the basic block.
if (I != MF.begin()) {
- printBasicBlockLabel(I, true, true, VerboseAsm);
- O << '\n';
+ EmitBasicBlockStart(I);
}
for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
II != E; ++II) {
@@ -276,14 +293,12 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (TAI->hasDotTypeDotSizeDirective())
+ if (MAI->hasDotTypeDotSizeDirective())
O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n";
// Emit post-function debug information.
DW->EndFunction(&MF);
- O.flush();
-
return false;
}
@@ -298,37 +313,39 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
O << '{'
- << TRI->getAsmName(DRegLo) << "-" << TRI->getAsmName(DRegHi)
+ << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
<< '}';
+ } else if (Modifier && strcmp(Modifier, "lane") == 0) {
+ unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+ unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
+ &ARM::DPR_VFP2RegClass);
+ O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
} else {
- O << TRI->getAsmName(Reg);
+ O << getRegisterName(Reg);
}
} else
- assert(0 && "not implemented");
+ llvm_unreachable("not implemented");
break;
}
case MachineOperand::MO_Immediate: {
- if (!Modifier || strcmp(Modifier, "no_hash") != 0)
- O << "#";
-
- O << MO.getImm();
+ int64_t Imm = MO.getImm();
+ O << '#';
+ if (Modifier) {
+ if (strcmp(Modifier, "lo16") == 0)
+ O << ":lower16:";
+ else if (strcmp(Modifier, "hi16") == 0)
+ O << ":upper16:";
+ }
+ O << Imm;
break;
}
case MachineOperand::MO_MachineBasicBlock:
- printBasicBlockLabel(MO.getMBB());
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
return;
case MachineOperand::MO_GlobalAddress: {
bool isCallOp = Modifier && !strcmp(Modifier, "call");
GlobalValue *GV = MO.getGlobal();
- std::string Name = Mang->getValueName(GV);
- bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() ||
- GV->hasLinkOnceLinkage());
- if (isExt && isCallOp && Subtarget->isTargetDarwin() &&
- TM.getRelocationModel() != Reloc::Static) {
- printSuffixedName(Name, "$stub");
- FnStubs.insert(Name);
- } else
- O << Name;
+ O << Mang->getMangledName(GV);
printOffset(MO.getOffset());
@@ -339,25 +356,20 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
}
case MachineOperand::MO_ExternalSymbol: {
bool isCallOp = Modifier && !strcmp(Modifier, "call");
- std::string Name(TAI->getGlobalPrefix());
- Name += MO.getSymbolName();
- if (isCallOp && Subtarget->isTargetDarwin() &&
- TM.getRelocationModel() != Reloc::Static) {
- printSuffixedName(Name, "$stub");
- FnStubs.insert(Name);
- } else
- O << Name;
+ std::string Name = Mang->makeNameProper(MO.getSymbolName());
+
+ O << Name;
if (isCallOp && Subtarget->isTargetELF() &&
TM.getRelocationModel() == Reloc::PIC_)
O << "(PLT)";
break;
}
case MachineOperand::MO_ConstantPoolIndex:
- O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
<< '_' << MO.getIndex();
break;
case MachineOperand::MO_JumpTableIndex:
- O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
<< '_' << MO.getIndex();
break;
default:
@@ -365,9 +377,12 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
}
}
-static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
- const TargetAsmInfo *TAI) {
- assert(V < (1 << 12) && "Not a valid so_imm value!");
+static void printSOImm(formatted_raw_ostream &O, int64_t V, bool VerboseAsm,
+ const MCAsmInfo *MAI) {
+ // Break it up into two parts that make up a shifter immediate.
+ V = ARM_AM::getSOImmVal(V);
+ assert(V != -1 && "Not a valid so_imm value!");
+
unsigned Imm = ARM_AM::getSOImmValImm(V);
unsigned Rot = ARM_AM::getSOImmValRot(V);
@@ -377,7 +392,7 @@ static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
O << "#" << Imm << ", " << Rot;
// Pretty printed version.
if (VerboseAsm)
- O << ' ' << TAI->getCommentString()
+ O << ' ' << MAI->getCommentString()
<< ' ' << (int)ARM_AM::rotr32(Imm, Rot);
} else {
O << "#" << Imm;
@@ -389,7 +404,7 @@ static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
void ARMAsmPrinter::printSOImmOperand(const MachineInstr *MI, int OpNum) {
const MachineOperand &MO = MI->getOperand(OpNum);
assert(MO.isImm() && "Not a valid so_imm value!");
- printSOImm(O, MO.getImm(), VerboseAsm, TAI);
+ printSOImm(O, MO.getImm(), VerboseAsm, MAI);
}
/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov'
@@ -399,15 +414,15 @@ void ARMAsmPrinter::printSOImm2PartOperand(const MachineInstr *MI, int OpNum) {
assert(MO.isImm() && "Not a valid so_imm value!");
unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO.getImm());
unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO.getImm());
- printSOImm(O, ARM_AM::getSOImmVal(V1), VerboseAsm, TAI);
+ printSOImm(O, V1, VerboseAsm, MAI);
O << "\n\torr";
printPredicateOperand(MI, 2);
O << " ";
- printOperand(MI, 0);
+ printOperand(MI, 0);
O << ", ";
- printOperand(MI, 0);
+ printOperand(MI, 0);
O << ", ";
- printSOImm(O, ARM_AM::getSOImmVal(V2), VerboseAsm, TAI);
+ printSOImm(O, V2, VerboseAsm, MAI);
}
// so_reg is a 4-operand unit corresponding to register forms of the A5.1
@@ -420,8 +435,7 @@ void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op) {
const MachineOperand &MO2 = MI->getOperand(Op+1);
const MachineOperand &MO3 = MI->getOperand(Op+2);
- assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
- O << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << getRegisterName(MO1.getReg());
// Print the shift opc.
O << ", "
@@ -429,8 +443,7 @@ void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op) {
<< " ";
if (MO2.getReg()) {
- assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg()));
- O << TM.getRegisterInfo()->get(MO2.getReg()).AsmName;
+ O << getRegisterName(MO2.getReg());
assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
} else {
O << "#" << ARM_AM::getSORegOffset(MO3.getImm());
@@ -447,7 +460,7 @@ void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op) {
return;
}
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
if (!MO2.getReg()) {
if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0.
@@ -460,8 +473,8 @@ void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op) {
O << ", "
<< (char)ARM_AM::getAM2Op(MO3.getImm())
- << TM.getRegisterInfo()->get(MO2.getReg()).AsmName;
-
+ << getRegisterName(MO2.getReg());
+
if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
O << ", "
<< ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
@@ -483,8 +496,8 @@ void ARMAsmPrinter::printAddrMode2OffsetOperand(const MachineInstr *MI, int Op){
}
O << (char)ARM_AM::getAM2Op(MO2.getImm())
- << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
-
+ << getRegisterName(MO1.getReg());
+
if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
O << ", "
<< ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
@@ -495,18 +508,18 @@ void ARMAsmPrinter::printAddrMode3Operand(const MachineInstr *MI, int Op) {
const MachineOperand &MO1 = MI->getOperand(Op);
const MachineOperand &MO2 = MI->getOperand(Op+1);
const MachineOperand &MO3 = MI->getOperand(Op+2);
-
+
assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
if (MO2.getReg()) {
O << ", "
<< (char)ARM_AM::getAM3Op(MO3.getImm())
- << TM.getRegisterInfo()->get(MO2.getReg()).AsmName
+ << getRegisterName(MO2.getReg())
<< "]";
return;
}
-
+
if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
O << ", #"
<< (char)ARM_AM::getAM3Op(MO3.getImm())
@@ -520,7 +533,7 @@ void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op){
if (MO1.getReg()) {
O << (char)ARM_AM::getAM3Op(MO2.getImm())
- << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ << getRegisterName(MO1.getReg());
return;
}
@@ -530,7 +543,7 @@ void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op){
<< (char)ARM_AM::getAM3Op(MO2.getImm())
<< ImmOffs;
}
-
+
void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op,
const char *Modifier) {
const MachineOperand &MO1 = MI->getOperand(Op);
@@ -538,11 +551,18 @@ void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op,
ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
if (Modifier && strcmp(Modifier, "submode") == 0) {
if (MO1.getReg() == ARM::SP) {
+ // FIXME
bool isLDM = (MI->getOpcode() == ARM::LDM ||
- MI->getOpcode() == ARM::LDM_RET);
+ MI->getOpcode() == ARM::LDM_RET ||
+ MI->getOpcode() == ARM::t2LDM ||
+ MI->getOpcode() == ARM::t2LDM_RET);
O << ARM_AM::getAMSubModeAltStr(Mode, isLDM);
} else
O << ARM_AM::getAMSubModeStr(Mode);
+ } else if (Modifier && strcmp(Modifier, "wide") == 0) {
+ ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
+ if (Mode == ARM_AM::ia)
+ O << ".w";
} else {
printOperand(MI, Op);
if (ARM_AM::getAM4WBFlag(MO2.getImm()))
@@ -559,7 +579,7 @@ void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op,
printOperand(MI, Op);
return;
}
-
+
assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
if (Modifier && strcmp(Modifier, "submode") == 0) {
@@ -573,14 +593,14 @@ void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op,
return;
} else if (Modifier && strcmp(Modifier, "base") == 0) {
// Used for FSTM{D|S} and LSTM{D|S} operations.
- O << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << getRegisterName(MO1.getReg());
if (ARM_AM::getAM5WBFlag(MO2.getImm()))
O << "!";
return;
}
-
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
-
+
+ O << "[" << getRegisterName(MO1.getReg());
+
if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
O << ", #"
<< (char)ARM_AM::getAM5Op(MO2.getImm())
@@ -595,13 +615,13 @@ void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op) {
const MachineOperand &MO3 = MI->getOperand(Op+2);
// FIXME: No support yet for specifying alignment.
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName << "]";
+ O << "[" << getRegisterName(MO1.getReg()) << "]";
if (ARM_AM::getAM6WBFlag(MO3.getImm())) {
if (MO2.getReg() == 0)
O << "!";
else
- O << ", " << TM.getRegisterInfo()->get(MO2.getReg()).AsmName;
+ O << ", " << getRegisterName(MO2.getReg());
}
}
@@ -614,7 +634,7 @@ void ARMAsmPrinter::printAddrModePCOperand(const MachineInstr *MI, int Op,
const MachineOperand &MO1 = MI->getOperand(Op);
assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
- O << "[pc, +" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName << "]";
+ O << "[pc, +" << getRegisterName(MO1.getReg()) << "]";
}
void
@@ -630,11 +650,26 @@ ARMAsmPrinter::printBitfieldInvMaskImmOperand(const MachineInstr *MI, int Op) {
//===--------------------------------------------------------------------===//
void
+ARMAsmPrinter::printThumbITMask(const MachineInstr *MI, int Op) {
+ // (3 - the number of trailing zeros) is the number of then / else.
+ unsigned Mask = MI->getOperand(Op).getImm();
+ unsigned NumTZ = CountTrailingZeros_32(Mask);
+ assert(NumTZ <= 3 && "Invalid IT mask!");
+ for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
+ bool T = (Mask & (1 << Pos)) == 0;
+ if (T)
+ O << 't';
+ else
+ O << 'e';
+ }
+}
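
To make the mask decoding concrete: the position of the lowest set bit bounds the suffix, and each higher bit picks one 't'/'e' slot after the base IT. A standalone sketch mirroring the loop above (same clear-bit-means-'t' convention as the printer):

    #include <cassert>
    #include <string>

    static std::string itMaskSuffix(unsigned Mask) {
      unsigned NumTZ = 0;
      while (NumTZ < 4 && !(Mask & (1u << NumTZ)))
        ++NumTZ;                       // count trailing zeros, as above
      assert(NumTZ <= 3 && "invalid IT mask");
      std::string Suffix;
      for (unsigned Pos = 3; Pos > NumTZ; --Pos)
        Suffix += (Mask & (1u << Pos)) ? 'e' : 't';
      return Suffix;
    }

    int main() {
      assert(itMaskSuffix(0x8) == "");    // plain "it": no extra slots
      assert(itMaskSuffix(0x4) == "t");   // "itt"
      assert(itMaskSuffix(0x2) == "tt");  // "ittt"
    }
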
+
+void
ARMAsmPrinter::printThumbAddrModeRROperand(const MachineInstr *MI, int Op) {
const MachineOperand &MO1 = MI->getOperand(Op);
const MachineOperand &MO2 = MI->getOperand(Op+1);
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
- O << ", " << TM.getRegisterInfo()->get(MO2.getReg()).AsmName << "]";
+ O << "[" << getRegisterName(MO1.getReg());
+ O << ", " << getRegisterName(MO2.getReg()) << "]";
}
void
@@ -649,9 +684,9 @@ ARMAsmPrinter::printThumbAddrModeRI5Operand(const MachineInstr *MI, int Op,
return;
}
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
if (MO3.getReg())
- O << ", " << TM.getRegisterInfo()->get(MO3.getReg()).AsmName;
+ O << ", " << getRegisterName(MO3.getReg());
else if (unsigned ImmOffs = MO2.getImm()) {
O << ", #" << ImmOffs;
if (Scale > 1)
@@ -676,7 +711,7 @@ ARMAsmPrinter::printThumbAddrModeS4Operand(const MachineInstr *MI, int Op) {
void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op) {
const MachineOperand &MO1 = MI->getOperand(Op);
const MachineOperand &MO2 = MI->getOperand(Op+1);
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
if (unsigned ImmOffs = MO2.getImm())
O << ", #" << ImmOffs << " * 4";
O << "]";
@@ -684,20 +719,6 @@ void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op) {
//===--------------------------------------------------------------------===//
-/// printT2SOImmOperand - T2SOImm is:
-/// 1. a 4-bit splat control value and 8 bit immediate value
-/// 2. a 5-bit rotate amount and a non-zero 8-bit immediate value
-/// represented by a normalizedin 7-bit value (msb is always 1)
-void ARMAsmPrinter::printT2SOImmOperand(const MachineInstr *MI, int OpNum) {
- const MachineOperand &MO = MI->getOperand(OpNum);
- assert(MO.isImm() && "Not a valid so_imm value!");
-
- unsigned Imm = ARM_AM::getT2SOImmValDecode(MO.getImm());
- // Always print the immediate directly, as the "rotate" form
- // is deprecated in some contexts.
- O << "#" << Imm;
-}
-
// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2
// register with shift forms.
// REG 0 0 - e.g. R5
@@ -708,7 +729,7 @@ void ARMAsmPrinter::printT2SOOperand(const MachineInstr *MI, int OpNum) {
unsigned Reg = MO1.getReg();
assert(TargetRegisterInfo::isPhysicalRegister(Reg));
- O << TM.getRegisterInfo()->getAsmName(Reg);
+ O << getRegisterName(Reg);
// Print the shift opc.
O << ", "
@@ -724,7 +745,7 @@ void ARMAsmPrinter::printT2AddrModeImm12Operand(const MachineInstr *MI,
const MachineOperand &MO1 = MI->getOperand(OpNum);
const MachineOperand &MO2 = MI->getOperand(OpNum+1);
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
unsigned OffImm = MO2.getImm();
if (OffImm) // Don't print +0.
@@ -737,7 +758,7 @@ void ARMAsmPrinter::printT2AddrModeImm8Operand(const MachineInstr *MI,
const MachineOperand &MO1 = MI->getOperand(OpNum);
const MachineOperand &MO2 = MI->getOperand(OpNum+1);
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
int32_t OffImm = (int32_t)MO2.getImm();
// Don't print +0.
@@ -748,6 +769,22 @@ void ARMAsmPrinter::printT2AddrModeImm8Operand(const MachineInstr *MI,
O << "]";
}
+void ARMAsmPrinter::printT2AddrModeImm8s4Operand(const MachineInstr *MI,
+ int OpNum) {
+ const MachineOperand &MO1 = MI->getOperand(OpNum);
+ const MachineOperand &MO2 = MI->getOperand(OpNum+1);
+
+ O << "[" << getRegisterName(MO1.getReg());
+
+ int32_t OffImm = (int32_t)MO2.getImm() / 4;
+ // Don't print +0.
+ if (OffImm < 0)
+ O << ", #-" << -OffImm << " * 4";
+ else if (OffImm > 0)
+ O << ", #+" << OffImm << " * 4";
+ O << "]";
+}
+
void ARMAsmPrinter::printT2AddrModeImm8OffsetOperand(const MachineInstr *MI,
int OpNum) {
const MachineOperand &MO1 = MI->getOperand(OpNum);
@@ -765,17 +802,15 @@ void ARMAsmPrinter::printT2AddrModeSoRegOperand(const MachineInstr *MI,
const MachineOperand &MO2 = MI->getOperand(OpNum+1);
const MachineOperand &MO3 = MI->getOperand(OpNum+2);
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
- if (MO2.getReg()) {
- O << ", +"
- << TM.getRegisterInfo()->get(MO2.getReg()).AsmName;
+ assert(MO2.getReg() && "Invalid so_reg load / store address!");
+ O << ", " << getRegisterName(MO2.getReg());
- unsigned ShAmt = MO3.getImm();
- if (ShAmt) {
- assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
- O << ", lsl #" << ShAmt;
- }
+ unsigned ShAmt = MO3.getImm();
+ if (ShAmt) {
+ assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
+ O << ", lsl #" << ShAmt;
}
O << "]";
}
@@ -799,14 +834,17 @@ void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int OpNum){
void ARMAsmPrinter::printPCLabel(const MachineInstr *MI, int OpNum) {
int Id = (int)MI->getOperand(OpNum).getImm();
- O << TAI->getPrivateGlobalPrefix() << "PC" << Id;
+ O << MAI->getPrivateGlobalPrefix() << "PC" << Id;
}
void ARMAsmPrinter::printRegisterList(const MachineInstr *MI, int OpNum) {
O << "{";
- for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
+  // Always skip the first operand; it's the optional (and implicit) writeback.
+ for (unsigned i = OpNum+1, e = MI->getNumOperands(); i != e; ++i) {
+ if (MI->getOperand(i).isImplicit())
+ continue;
+ if ((int)i != OpNum+1) O << ", ";
printOperand(MI, i);
- if (i != e-1) O << ", ";
}
O << "}";
}
@@ -818,14 +856,14 @@ void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNum,
// data itself.
if (!strcmp(Modifier, "label")) {
unsigned ID = MI->getOperand(OpNum).getImm();
- O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
<< '_' << ID << ":\n";
} else {
assert(!strcmp(Modifier, "cpentry") && "Unknown modifier for CPE");
unsigned CPI = MI->getOperand(OpNum).getIndex();
const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI];
-
+
if (MCPE.isMachineConstantPoolEntry()) {
EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
} else {
@@ -835,57 +873,119 @@ void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNum,
}
void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum) {
+ assert(!Subtarget->isThumb2() && "Thumb2 should use double-jump jumptables!");
+
const MachineOperand &MO1 = MI->getOperand(OpNum);
const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
unsigned JTI = MO1.getIndex();
- O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
<< '_' << JTI << '_' << MO2.getImm() << ":\n";
- const char *JTEntryDirective = TAI->getJumpTableDirective();
- if (!JTEntryDirective)
- JTEntryDirective = TAI->getData32bitsDirective();
+ const char *JTEntryDirective = MAI->getData32bitsDirective();
const MachineFunction *MF = MI->getParent()->getParent();
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
- bool UseSet= TAI->getSetDirective() && TM.getRelocationModel() == Reloc::PIC_;
- std::set<MachineBasicBlock*> JTSets;
+ bool UseSet= MAI->getSetDirective() && TM.getRelocationModel() == Reloc::PIC_;
+ SmallPtrSet<MachineBasicBlock*, 8> JTSets;
for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
MachineBasicBlock *MBB = JTBBs[i];
- if (UseSet && JTSets.insert(MBB).second)
+ bool isNew = JTSets.insert(MBB);
+
+ if (UseSet && isNew)
printPICJumpTableSetLabel(JTI, MO2.getImm(), MBB);
O << JTEntryDirective << ' ';
if (UseSet)
- O << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << getFunctionNumber()
<< '_' << JTI << '_' << MO2.getImm()
<< "_set_" << MBB->getNumber();
else if (TM.getRelocationModel() == Reloc::PIC_) {
- printBasicBlockLabel(MBB, false, false, false);
- // If the arch uses custom Jump Table directives, don't calc relative to JT
- if (!TAI->getJumpTableDirective())
- O << '-' << TAI->getPrivateGlobalPrefix() << "JTI"
- << getFunctionNumber() << '_' << JTI << '_' << MO2.getImm();
- } else
- printBasicBlockLabel(MBB, false, false, false);
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+ O << '-' << MAI->getPrivateGlobalPrefix() << "JTI"
+ << getFunctionNumber() << '_' << JTI << '_' << MO2.getImm();
+ } else {
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+ }
+ if (i != e-1)
+ O << '\n';
+ }
+}
+
+void ARMAsmPrinter::printJT2BlockOperand(const MachineInstr *MI, int OpNum) {
+ const MachineOperand &MO1 = MI->getOperand(OpNum);
+ const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
+ unsigned JTI = MO1.getIndex();
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << JTI << '_' << MO2.getImm() << ":\n";
+
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+ bool ByteOffset = false, HalfWordOffset = false;
+ if (MI->getOpcode() == ARM::t2TBB)
+ ByteOffset = true;
+ else if (MI->getOpcode() == ARM::t2TBH)
+ HalfWordOffset = true;
+
+ for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+ MachineBasicBlock *MBB = JTBBs[i];
+ if (ByteOffset)
+ O << MAI->getData8bitsDirective();
+ else if (HalfWordOffset)
+ O << MAI->getData16bitsDirective();
+ if (ByteOffset || HalfWordOffset) {
+ O << '(';
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+ O << "-" << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << JTI << '_' << MO2.getImm() << ")/2";
+ } else {
+ O << "\tb.w ";
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+ }
if (i != e-1)
O << '\n';
}
+
+ // Make sure the instruction that follows TBB is 2-byte aligned.
+ // FIXME: Constant island pass should insert an "ALIGN" instruction instead.
+ if (ByteOffset && (JTBBs.size() & 1)) {
+ O << '\n';
+ EmitAlignment(1);
+ }
+}
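
The ")/2" in the emitted entries reflects that TBB/TBH tables store halfword-scaled offsets from the table base, since Thumb instructions are 2-byte aligned. A sketch of the arithmetic with illustrative addresses:

    #include <cassert>
    #include <cstdint>

    int main() {
      // A TBB entry holds (TargetAddr - TableBase) / 2, scaled to halfwords
      // (addresses illustrative; TBH widens the entry to 16 bits).
      uint32_t TableBase = 0x8000, TargetAddr = 0x8024;
      uint32_t Entry = (TargetAddr - TableBase) / 2;
      assert(Entry == 0x12 && Entry <= 0xFF); // fits the byte-sized TBB entry
    }
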
+
+void ARMAsmPrinter::printTBAddrMode(const MachineInstr *MI, int OpNum) {
+ O << "[pc, " << getRegisterName(MI->getOperand(OpNum).getReg());
+ if (MI->getOpcode() == ARM::t2TBH)
+ O << ", lsl #1";
+ O << ']';
}
+void ARMAsmPrinter::printNoHashImmediate(const MachineInstr *MI, int OpNum) {
+ O << MI->getOperand(OpNum).getImm();
+}
bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
unsigned AsmVariant, const char *ExtraCode){
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0) return true; // Unknown modifier.
-
+
switch (ExtraCode[0]) {
default: return true; // Unknown modifier.
- case 'a': // Don't print "#" before a global var name or constant.
- case 'c': // Don't print "$" before a global var name or constant.
- printOperand(MI, OpNum, "no_hash");
+ case 'a': // Print as a memory address.
+ if (MI->getOperand(OpNum).isReg()) {
+ O << "[" << getRegisterName(MI->getOperand(OpNum).getReg()) << "]";
+ return false;
+ }
+ // Fallthrough
+ case 'c': // Don't print "#" before an immediate operand.
+ if (!MI->getOperand(OpNum).isImm())
+ return true;
+ printNoHashImmediate(MI, OpNum);
return false;
case 'P': // Print a VFP double precision register.
printOperand(MI, OpNum);
@@ -898,7 +998,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
if (TM.getTargetData()->isBigEndian())
break;
// Fallthrough
- case 'H': // Write second word of DI / DF reference.
+ case 'H': // Write second word of DI / DF reference.
// Verify that this operand has two consecutive registers.
if (!MI->getOperand(OpNum).isReg() ||
OpNum+1 == MI->getNumOperands() ||
@@ -907,7 +1007,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
++OpNum; // Return the high-part.
}
}
-
+
printOperand(MI, OpNum);
return false;
}
@@ -917,7 +1017,10 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
const char *ExtraCode) {
if (ExtraCode && ExtraCode[0])
return true; // Unknown modifier.
- printAddrMode2Operand(MI, OpNum);
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ assert(MO.isReg() && "unexpected inline asm memory operand");
+ O << "[" << getRegisterName(MO.getReg()) << "]";
return false;
}
@@ -938,16 +1041,47 @@ void ARMAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
}}
// Call the autogenerated instruction printer routines.
+ processDebugLoc(MI, true);
printInstruction(MI);
+ if (VerboseAsm && !MI->getDebugLoc().isUnknown())
+ EmitComments(*MI);
+ O << '\n';
+ processDebugLoc(MI, false);
}
-bool ARMAsmPrinter::doInitialization(Module &M) {
-
- bool Result = AsmPrinter::doInitialization(M);
- DW = getAnalysisIfAvailable<DwarfWriter>();
+void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
+ if (Subtarget->isTargetDarwin()) {
+ Reloc::Model RelocM = TM.getRelocationModel();
+ if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) {
+ // Declare all the text sections up front (before the DWARF sections
+ // emitted by AsmPrinter::doInitialization) so the assembler will keep
+ // them together at the beginning of the object file. This helps
+      // avoid out-of-range branches that are due to a fundamental limitation of
+ // the way symbol offsets are encoded with the current Darwin ARM
+ // relocations.
+ TargetLoweringObjectFileMachO &TLOFMacho =
+ static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+ OutStreamer.SwitchSection(TLOFMacho.getTextSection());
+ OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection());
+ OutStreamer.SwitchSection(TLOFMacho.getConstTextCoalSection());
+ if (RelocM == Reloc::DynamicNoPIC) {
+ const MCSection *sect =
+ TLOFMacho.getMachOSection("__TEXT", "__symbol_stub4",
+ MCSectionMachO::S_SYMBOL_STUBS,
+ 12, SectionKind::getText());
+ OutStreamer.SwitchSection(sect);
+ } else {
+ const MCSection *sect =
+ TLOFMacho.getMachOSection("__TEXT", "__picsymbolstub4",
+ MCSectionMachO::S_SYMBOL_STUBS,
+ 16, SectionKind::getText());
+ OutStreamer.SwitchSection(sect);
+ }
+ }
+ }
- // Thumb-2 instructions are supported only in unified assembler syntax mode.
- if (Subtarget->hasThumb2())
+ // Use unified assembler syntax mode for Thumb.
+ if (Subtarget->isThumb())
O << "\t.syntax unified\n";
// Emit ARM Build Attributes
@@ -975,22 +1109,16 @@ bool ARMAsmPrinter::doInitialization(Module &M) {
O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_align8_needed << ", 1\n"
<< "\t.eabi_attribute " << ARMBuildAttrs::ABI_align8_preserved << ", 1\n";
+ // Hard float. Use both S and D registers and conform to AAPCS-VFP.
+ if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard)
+ O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_HardFP_use << ", 3\n"
+ << "\t.eabi_attribute " << ARMBuildAttrs::ABI_VFP_args << ", 1\n";
+
// FIXME: Should we signal R9 usage?
}
-
- return Result;
-}
-
-/// PrintUnmangledNameSafely - Print out the printable characters in the name.
-/// Don't print things like \\n or \\0.
-static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
- for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
- Name != E; ++Name)
- if (isprint(*Name))
- OS << *Name;
}
-void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+void ARMAsmPrinter::PrintGlobalVariable(const GlobalVariable* GVar) {
const TargetData *TD = TM.getTargetData();
if (!GVar->hasInitializer()) // External global require no code
@@ -1009,10 +1137,8 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
return;
}
- std::string name = Mang->getValueName(GVar);
+ std::string name = Mang->getMangledName(GVar);
Constant *C = GVar->getInitializer();
- if (isa<MDNode>(C) || isa<MDString>(C))
- return;
const Type *Type = C->getType();
unsigned Size = TD->getTypeAllocSize(Type);
unsigned Align = TD->getPreferredAlignmentLog(GVar);
@@ -1023,14 +1149,16 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
if (Subtarget->isTargetELF())
O << "\t.type " << name << ",%object\n";
- if (C->isNullValue() && !GVar->hasSection() && !GVar->isThreadLocal() &&
- !(isDarwin &&
- TAI->SectionKindForGlobal(GVar) == SectionKind::RODataMergeStr)) {
- // FIXME: This seems to be pretty darwin-specific
+ const MCSection *TheSection =
+ getObjFileLowering().SectionForGlobal(GVar, Mang, TM);
+ OutStreamer.SwitchSection(TheSection);
+ // FIXME: get this stuff from section kind flags.
+ if (C->isNullValue() && !GVar->hasSection() && !GVar->isThreadLocal() &&
+ // Don't put things that should go in the cstring section into "comm".
+ !TheSection->getKind().isMergeableCString()) {
if (GVar->hasExternalLinkage()) {
- SwitchToSection(TAI->SectionForGlobal(GVar));
- if (const char *Directive = TAI->getZeroFillDirective()) {
+ if (const char *Directive = MAI->getZeroFillDirective()) {
O << "\t.globl\t" << name << "\n";
O << Directive << "__DATA, __common, " << name << ", "
<< Size << ", " << Align << "\n";
@@ -1043,57 +1171,56 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
if (isDarwin) {
if (GVar->hasLocalLinkage()) {
- O << TAI->getLCOMMDirective() << name << "," << Size
+ O << MAI->getLCOMMDirective() << name << "," << Size
<< ',' << Align;
} else if (GVar->hasCommonLinkage()) {
- O << TAI->getCOMMDirective() << name << "," << Size
+ O << MAI->getCOMMDirective() << name << "," << Size
<< ',' << Align;
} else {
- SwitchToSection(TAI->SectionForGlobal(GVar));
+ OutStreamer.SwitchSection(TheSection);
O << "\t.globl " << name << '\n'
- << TAI->getWeakDefDirective() << name << '\n';
+ << MAI->getWeakDefDirective() << name << '\n';
EmitAlignment(Align, GVar);
O << name << ":";
if (VerboseAsm) {
- O << "\t\t\t\t" << TAI->getCommentString() << ' ';
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t\t\t" << MAI->getCommentString() << ' ';
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
}
O << '\n';
EmitGlobalConstant(C);
return;
}
- } else if (TAI->getLCOMMDirective() != NULL) {
+ } else if (MAI->getLCOMMDirective() != NULL) {
if (GVar->hasLocalLinkage()) {
- O << TAI->getLCOMMDirective() << name << "," << Size;
+ O << MAI->getLCOMMDirective() << name << "," << Size;
} else {
- O << TAI->getCOMMDirective() << name << "," << Size;
- if (TAI->getCOMMDirectiveTakesAlignment())
- O << ',' << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ O << MAI->getCOMMDirective() << name << "," << Size;
+ if (MAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (MAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
}
} else {
- SwitchToSection(TAI->SectionForGlobal(GVar));
if (GVar->hasLocalLinkage())
O << "\t.local\t" << name << "\n";
- O << TAI->getCOMMDirective() << name << "," << Size;
- if (TAI->getCOMMDirectiveTakesAlignment())
- O << "," << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ O << MAI->getCOMMDirective() << name << "," << Size;
+ if (MAI->getCOMMDirectiveTakesAlignment())
+ O << "," << (MAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
}
if (VerboseAsm) {
- O << "\t\t" << TAI->getCommentString() << " ";
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t" << MAI->getCommentString() << " ";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
}
O << "\n";
return;
}
}
- SwitchToSection(TAI->SectionForGlobal(GVar));
switch (GVar->getLinkage()) {
- case GlobalValue::CommonLinkage:
- case GlobalValue::LinkOnceAnyLinkage:
- case GlobalValue::LinkOnceODRLinkage:
- case GlobalValue::WeakAnyLinkage:
- case GlobalValue::WeakODRLinkage:
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
if (isDarwin) {
O << "\t.globl " << name << "\n"
<< "\t.weak_definition " << name << "\n";
@@ -1101,28 +1228,27 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
O << "\t.weak " << name << "\n";
}
break;
- case GlobalValue::AppendingLinkage:
- // FIXME: appending linkage variables should go into a section of
- // their name or something. For now, just emit them as external.
- case GlobalValue::ExternalLinkage:
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section of
+ // their name or something. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
O << "\t.globl " << name << "\n";
- // FALL THROUGH
- case GlobalValue::PrivateLinkage:
- case GlobalValue::InternalLinkage:
break;
- default:
- assert(0 && "Unknown linkage type!");
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
break;
+ default:
+ llvm_unreachable("Unknown linkage type!");
}
EmitAlignment(Align, GVar);
O << name << ":";
if (VerboseAsm) {
- O << "\t\t\t\t" << TAI->getCommentString() << " ";
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t\t\t" << MAI->getCommentString() << " ";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
}
O << "\n";
- if (TAI->hasDotTypeDotSizeDirective())
+ if (MAI->hasDotTypeDotSizeDirective())
O << "\t.size " << name << ", " << Size << "\n";
EmitGlobalConstant(C);
@@ -1131,83 +1257,36 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
bool ARMAsmPrinter::doFinalization(Module &M) {
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- printModuleLevelGV(I);
-
if (Subtarget->isTargetDarwin()) {
- SwitchToDataSection("");
-
- // Output stubs for dynamically-linked functions
- for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
- i != e; ++i) {
- if (TM.getRelocationModel() == Reloc::PIC_)
- SwitchToTextSection(".section __TEXT,__picsymbolstub4,symbol_stubs,"
- "none,16", 0);
- else
- SwitchToTextSection(".section __TEXT,__symbol_stub4,symbol_stubs,"
- "none,12", 0);
+ // All darwin targets use mach-o.
+ TargetLoweringObjectFileMachO &TLOFMacho =
+ static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
- EmitAlignment(2);
- O << "\t.code\t32\n";
-
- const char *p = i->getKeyData();
- printSuffixedName(p, "$stub");
- O << ":\n";
- O << "\t.indirect_symbol " << p << "\n";
- O << "\tldr ip, ";
- printSuffixedName(p, "$slp");
- O << "\n";
- if (TM.getRelocationModel() == Reloc::PIC_) {
- printSuffixedName(p, "$scv");
- O << ":\n";
- O << "\tadd ip, pc, ip\n";
- }
- O << "\tldr pc, [ip, #0]\n";
- printSuffixedName(p, "$slp");
- O << ":\n";
- O << "\t.long\t";
- printSuffixedName(p, "$lazy_ptr");
- if (TM.getRelocationModel() == Reloc::PIC_) {
- O << "-(";
- printSuffixedName(p, "$scv");
- O << "+8)\n";
- } else
- O << "\n";
- SwitchToDataSection(".lazy_symbol_pointer", 0);
- printSuffixedName(p, "$lazy_ptr");
- O << ":\n";
- O << "\t.indirect_symbol " << p << "\n";
- O << "\t.long\tdyld_stub_binding_helper\n";
- }
- O << "\n";
+ O << '\n';
// Output non-lazy-pointers for external and common global variables.
if (!GVNonLazyPtrs.empty()) {
- SwitchToDataSection("\t.non_lazy_symbol_pointer", 0);
- for (StringSet<>::iterator i = GVNonLazyPtrs.begin(),
- e = GVNonLazyPtrs.end(); i != e; ++i) {
- const char *p = i->getKeyData();
- printSuffixedName(p, "$non_lazy_ptr");
- O << ":\n";
- O << "\t.indirect_symbol " << p << "\n";
+ // Switch with ".non_lazy_symbol_pointer" directive.
+ OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+ EmitAlignment(2);
+ for (StringMap<std::string>::iterator I = GVNonLazyPtrs.begin(),
+ E = GVNonLazyPtrs.end(); I != E; ++I) {
+ O << I->second << ":\n";
+ O << "\t.indirect_symbol " << I->getKeyData() << "\n";
O << "\t.long\t0\n";
}
}
if (!HiddenGVNonLazyPtrs.empty()) {
- SwitchToSection(TAI->getDataSection());
- for (StringSet<>::iterator i = HiddenGVNonLazyPtrs.begin(),
- e = HiddenGVNonLazyPtrs.end(); i != e; ++i) {
- const char *p = i->getKeyData();
- EmitAlignment(2);
- printSuffixedName(p, "$non_lazy_ptr");
- O << ":\n";
- O << "\t.long " << p << "\n";
+ OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+ EmitAlignment(2);
+ for (StringMap<std::string>::iterator I = HiddenGVNonLazyPtrs.begin(),
+ E = HiddenGVNonLazyPtrs.end(); I != E; ++I) {
+ O << I->second << ":\n";
+ O << "\t.long " << I->getKeyData() << "\n";
}
}
-
// Funny Darwin hack: This flag tells the linker that no global symbols
// contain code that falls through to other global symbols (e.g. the obvious
// implementation of multiple entry points). If this doesn't occur, the
@@ -1219,24 +1298,8 @@ bool ARMAsmPrinter::doFinalization(Module &M) {
return AsmPrinter::doFinalization(M);
}
-/// createARMCodePrinterPass - Returns a pass that prints the ARM
-/// assembly code for a MachineFunction to the given output stream,
-/// using the given target machine description. This should work
-/// regardless of whether the function is in SSA form.
-///
-FunctionPass *llvm::createARMCodePrinterPass(raw_ostream &o,
- ARMBaseTargetMachine &tm,
- bool verbose) {
- return new ARMAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose);
-}
-
-namespace {
- static struct Register {
- Register() {
- ARMBaseTargetMachine::registerAsmPrinter(createARMCodePrinterPass);
- }
- } Registrator;
-}
-
// Force static initialization.
-extern "C" void LLVMInitializeARMAsmPrinter() { }
+extern "C" void LLVMInitializeARMAsmPrinter() {
+ RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget);
+ RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget);
+}
diff --git a/lib/Target/ARM/AsmPrinter/Makefile b/lib/Target/ARM/AsmPrinter/Makefile
index ce36cec..208becc 100644
--- a/lib/Target/ARM/AsmPrinter/Makefile
+++ b/lib/Target/ARM/AsmPrinter/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===##
+##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 9c46fe0..6e09eb2 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -12,6 +12,8 @@ tablegen(ARMGenCallingConv.inc -gen-callingconv)
tablegen(ARMGenSubtarget.inc -gen-subtarget)
add_llvm_target(ARMCodeGen
+ ARMBaseInstrInfo.cpp
+ ARMBaseRegisterInfo.cpp
ARMCodeEmitter.cpp
ARMConstantIslandPass.cpp
ARMConstantPoolValue.cpp
@@ -20,14 +22,17 @@ add_llvm_target(ARMCodeGen
ARMISelLowering.cpp
ARMJITInfo.cpp
ARMLoadStoreOptimizer.cpp
+ ARMMCAsmInfo.cpp
ARMRegisterInfo.cpp
ARMSubtarget.cpp
- ARMTargetAsmInfo.cpp
ARMTargetMachine.cpp
+ NEONPreAllocPass.cpp
Thumb1InstrInfo.cpp
Thumb1RegisterInfo.cpp
+ Thumb2ITBlockPass.cpp
Thumb2InstrInfo.cpp
Thumb2RegisterInfo.cpp
+ Thumb2SizeReduction.cpp
)
target_link_libraries (LLVMARMCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile
index 9a3b9be..a8dd38c 100644
--- a/lib/Target/ARM/Makefile
+++ b/lib/Target/ARM/Makefile
@@ -18,6 +18,6 @@ BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \
ARMGenDAGISel.inc ARMGenSubtarget.inc \
ARMGenCodeEmitter.inc ARMGenCallingConv.inc
-DIRS = AsmPrinter
+DIRS = AsmPrinter AsmParser TargetInfo
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp
new file mode 100644
index 0000000..821b872
--- /dev/null
+++ b/lib/Target/ARM/NEONPreAllocPass.cpp
@@ -0,0 +1,394 @@
+//===-- NEONPreAllocPass.cpp - Allocate adjacent NEON registers--*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "neon-prealloc"
+#include "ARM.h"
+#include "ARMInstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+namespace {
+ class VISIBILITY_HIDDEN NEONPreAllocPass : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+
+ public:
+ static char ID;
+ NEONPreAllocPass() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "NEON register pre-allocation pass";
+ }
+
+ private:
+ bool PreAllocNEONRegisters(MachineBasicBlock &MBB);
+ };
+
+ char NEONPreAllocPass::ID = 0;
+}
+
+static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs,
+ unsigned &Offset, unsigned &Stride) {
+ // Default to unit stride with no offset.
+ Stride = 1;
+ Offset = 0;
+
+ switch (Opcode) {
+ default:
+ break;
+
+ case ARM::VLD2d8:
+ case ARM::VLD2d16:
+ case ARM::VLD2d32:
+ case ARM::VLD2d64:
+ case ARM::VLD2LNd8:
+ case ARM::VLD2LNd16:
+ case ARM::VLD2LNd32:
+ FirstOpnd = 0;
+ NumRegs = 2;
+ return true;
+
+ case ARM::VLD2q8:
+ case ARM::VLD2q16:
+ case ARM::VLD2q32:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ return true;
+
+ case ARM::VLD2LNq16a:
+ case ARM::VLD2LNq32a:
+ FirstOpnd = 0;
+ NumRegs = 2;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD2LNq16b:
+ case ARM::VLD2LNq32b:
+ FirstOpnd = 0;
+ NumRegs = 2;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD3d8:
+ case ARM::VLD3d16:
+ case ARM::VLD3d32:
+ case ARM::VLD3d64:
+ case ARM::VLD3LNd8:
+ case ARM::VLD3LNd16:
+ case ARM::VLD3LNd32:
+ FirstOpnd = 0;
+ NumRegs = 3;
+ return true;
+
+ case ARM::VLD3q8a:
+ case ARM::VLD3q16a:
+ case ARM::VLD3q32a:
+ FirstOpnd = 0;
+ NumRegs = 3;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD3q8b:
+ case ARM::VLD3q16b:
+ case ARM::VLD3q32b:
+ FirstOpnd = 0;
+ NumRegs = 3;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD3LNq16a:
+ case ARM::VLD3LNq32a:
+ FirstOpnd = 0;
+ NumRegs = 3;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD3LNq16b:
+ case ARM::VLD3LNq32b:
+ FirstOpnd = 0;
+ NumRegs = 3;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD4d8:
+ case ARM::VLD4d16:
+ case ARM::VLD4d32:
+ case ARM::VLD4d64:
+ case ARM::VLD4LNd8:
+ case ARM::VLD4LNd16:
+ case ARM::VLD4LNd32:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ return true;
+
+ case ARM::VLD4q8a:
+ case ARM::VLD4q16a:
+ case ARM::VLD4q32a:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD4q8b:
+ case ARM::VLD4q16b:
+ case ARM::VLD4q32b:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD4LNq16a:
+ case ARM::VLD4LNq32a:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD4LNq16b:
+ case ARM::VLD4LNq32b:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VST2d8:
+ case ARM::VST2d16:
+ case ARM::VST2d32:
+ case ARM::VST2d64:
+ case ARM::VST2LNd8:
+ case ARM::VST2LNd16:
+ case ARM::VST2LNd32:
+ FirstOpnd = 3;
+ NumRegs = 2;
+ return true;
+
+ case ARM::VST2q8:
+ case ARM::VST2q16:
+ case ARM::VST2q32:
+ FirstOpnd = 3;
+ NumRegs = 4;
+ return true;
+
+ case ARM::VST2LNq16a:
+ case ARM::VST2LNq32a:
+ FirstOpnd = 3;
+ NumRegs = 2;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VST2LNq16b:
+ case ARM::VST2LNq32b:
+ FirstOpnd = 3;
+ NumRegs = 2;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VST3d8:
+ case ARM::VST3d16:
+ case ARM::VST3d32:
+ case ARM::VST3d64:
+ case ARM::VST3LNd8:
+ case ARM::VST3LNd16:
+ case ARM::VST3LNd32:
+ FirstOpnd = 3;
+ NumRegs = 3;
+ return true;
+
+ case ARM::VST3q8a:
+ case ARM::VST3q16a:
+ case ARM::VST3q32a:
+ FirstOpnd = 4;
+ NumRegs = 3;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VST3q8b:
+ case ARM::VST3q16b:
+ case ARM::VST3q32b:
+ FirstOpnd = 4;
+ NumRegs = 3;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VST3LNq16a:
+ case ARM::VST3LNq32a:
+ FirstOpnd = 3;
+ NumRegs = 3;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VST3LNq16b:
+ case ARM::VST3LNq32b:
+ FirstOpnd = 3;
+ NumRegs = 3;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VST4d8:
+ case ARM::VST4d16:
+ case ARM::VST4d32:
+ case ARM::VST4d64:
+ case ARM::VST4LNd8:
+ case ARM::VST4LNd16:
+ case ARM::VST4LNd32:
+ FirstOpnd = 3;
+ NumRegs = 4;
+ return true;
+
+ case ARM::VST4q8a:
+ case ARM::VST4q16a:
+ case ARM::VST4q32a:
+ FirstOpnd = 4;
+ NumRegs = 4;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VST4q8b:
+ case ARM::VST4q16b:
+ case ARM::VST4q32b:
+ FirstOpnd = 4;
+ NumRegs = 4;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VST4LNq16a:
+ case ARM::VST4LNq32a:
+ FirstOpnd = 3;
+ NumRegs = 4;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VST4LNq16b:
+ case ARM::VST4LNq32b:
+ FirstOpnd = 3;
+ NumRegs = 4;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VTBL2:
+ FirstOpnd = 1;
+ NumRegs = 2;
+ return true;
+
+ case ARM::VTBL3:
+ FirstOpnd = 1;
+ NumRegs = 3;
+ return true;
+
+ case ARM::VTBL4:
+ FirstOpnd = 1;
+ NumRegs = 4;
+ return true;
+
+ case ARM::VTBX2:
+ FirstOpnd = 2;
+ NumRegs = 2;
+ return true;
+
+ case ARM::VTBX3:
+ FirstOpnd = 2;
+ NumRegs = 3;
+ return true;
+
+ case ARM::VTBX4:
+ FirstOpnd = 2;
+ NumRegs = 4;
+ return true;
+ }
+
+ return false;
+}
+
+bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ for (; MBBI != E; ++MBBI) {
+ MachineInstr *MI = &*MBBI;
+ unsigned FirstOpnd, NumRegs, Offset, Stride;
+ if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride))
+ continue;
+
+ MachineBasicBlock::iterator NextI = next(MBBI);
+ for (unsigned R = 0; R < NumRegs; ++R) {
+ MachineOperand &MO = MI->getOperand(FirstOpnd + R);
+ assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand");
+ unsigned VirtReg = MO.getReg();
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ "expected a virtual register");
+
+ // For now, just assign a fixed set of adjacent registers.
+ // This leaves plenty of room for future improvements.
+ static const unsigned NEONDRegs[] = {
+ ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7
+ };
+ MO.setReg(NEONDRegs[Offset + R * Stride]);
+
+ if (MO.isUse()) {
+ // Insert a copy from VirtReg.
+ TII->copyRegToReg(MBB, MBBI, MO.getReg(), VirtReg,
+ ARM::DPRRegisterClass, ARM::DPRRegisterClass);
+ if (MO.isKill()) {
+ MachineInstr *CopyMI = prior(MBBI);
+ CopyMI->findRegisterUseOperand(VirtReg)->setIsKill();
+ }
+ MO.setIsKill();
+ } else if (MO.isDef() && !MO.isDead()) {
+ // Add a copy to VirtReg.
+ TII->copyRegToReg(MBB, NextI, VirtReg, MO.getReg(),
+ ARM::DPRRegisterClass, ARM::DPRRegisterClass);
+ }
+    }
+    Modified = true;
+ }
+
+ return Modified;
+}
+
+bool NEONPreAllocPass::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getTarget().getInstrInfo();
+
+ bool Modified = false;
+ for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+ ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ Modified |= PreAllocNEONRegisters(MBB);
+ }
+
+ return Modified;
+}
+
+/// createNEONPreAllocPass - Returns an instance of the NEON register
+/// pre-allocation pass.
+FunctionPass *llvm::createNEONPreAllocPass() {
+ return new NEONPreAllocPass();
+}
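
The opcode table above reduces to one rule: operand R of a multi-register NEON op is pinned to D[Offset + R * Stride], so the "a"/"b" opcode variants (Stride 2, Offset 0 or 1) land on the even or odd D halves of consecutive Q registers. A minimal standalone sketch of that indexing, with plain integers standing in for LLVM's register enums:

  #include <cassert>
  #include <cstdio>

  // Illustrative only: D0-D7 are modeled as indices 0-7 rather than
  // ARM::D0 etc. This mirrors MO.setReg(NEONDRegs[Offset + R * Stride])
  // in PreAllocNEONRegisters above.
  static unsigned assignedDReg(unsigned R, unsigned Offset, unsigned Stride) {
    unsigned Idx = Offset + R * Stride;
    assert(Idx < 8 && "fixed assignment only covers D0-D7");
    return Idx;
  }

  int main() {
    // VLD3q32a: NumRegs=3, Offset=0, Stride=2 -> D0, D2, D4 (even halves).
    for (unsigned R = 0; R != 3; ++R)
      std::printf("VLD3q32a operand %u -> D%u\n", R, assignedDReg(R, 0, 2));
    // VLD3q32b: NumRegs=3, Offset=1, Stride=2 -> D1, D3, D5 (odd halves).
    for (unsigned R = 0; R != 3; ++R)
      std::printf("VLD3q32b operand %u -> D%u\n", R, assignedDReg(R, 1, 2));
    return 0;
  }
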
diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt
index 4d3200b..a961a57 100644
--- a/lib/Target/ARM/README-Thumb.txt
+++ b/lib/Target/ARM/README-Thumb.txt
@@ -226,3 +226,31 @@ etc. Almost all Thumb instructions clobber condition code.
//===---------------------------------------------------------------------===//
Add ldmia, stmia support.
+
+//===---------------------------------------------------------------------===//
+
+Thumb load / store address mode offsets are scaled. The values kept in the
+instruction operands are pre-scale values. This probably ought to be changed
+to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.
+
+//===---------------------------------------------------------------------===//
+
+We need to make (some of the) Thumb1 instructions predicable. That will allow
+shrinking of predicated Thumb2 instructions. To allow this, we need to be able
+to toggle the 's' bit, since these instructions do not set CPSR when they are
+inside IT blocks.
+
+//===---------------------------------------------------------------------===//
+
+Make use of hi register variants of cmp: tCMPhir / tCMPZhir.
+
+//===---------------------------------------------------------------------===//
+
+Thumb1 immediate fields sometimes keep pre-scaled values. See
+Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
+Thumb2.
+
+//===---------------------------------------------------------------------===//
+
+Rather than having tBR_JTr print a ".align 2" and having the constant island
+pass pad it, add a target-specific ALIGN instruction instead. That way,
+GetInstSizeInBytes won't have to over-estimate. It can also be used by the
+loop alignment pass.
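
To make the scaled-offset note above concrete: a Thumb1 word ldr/str encodes ByteOffset / 4 in a 5-bit field, so the pre-scale operand value and the encoded value differ by the access size. A hedged sketch of that round trip; the field width and scale are the architectural values for the word form (byte and halfword forms scale by 1 and 2 instead):

  #include <cassert>
  #include <cstdio>

  // Sketch: the immediate field of a Thumb1 word ldr/str holds
  // ByteOffset / 4 in 5 bits (0-31), i.e. byte offsets 0-124 in steps of 4.
  static bool encodeWordOffset(unsigned ByteOffset, unsigned &Imm5) {
    if (ByteOffset % 4 != 0)
      return false;              // must be word aligned
    unsigned Scaled = ByteOffset / 4;
    if (Scaled > 31)
      return false;              // does not fit in imm5
    Imm5 = Scaled;
    return true;
  }

  int main() {
    unsigned Imm5;
    assert(encodeWordOffset(124, Imm5) && Imm5 == 31); // largest encodable
    assert(!encodeWordOffset(126, Imm5));              // unaligned
    assert(!encodeWordOffset(128, Imm5));              // out of range
    std::printf("imm5 for +124 is %u\n", Imm5);
    return 0;
  }
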
diff --git a/lib/Target/ARM/README-Thumb2.txt b/lib/Target/ARM/README-Thumb2.txt
new file mode 100644
index 0000000..e7c2552
--- /dev/null
+++ b/lib/Target/ARM/README-Thumb2.txt
@@ -0,0 +1,6 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend (Thumb2 specific).
+//===---------------------------------------------------------------------===//
+
+Make sure jumptable destinations are below the jumptable in order to make use
+of tbb / tbh.
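
The forward-only layout constraint in the note above follows from the tbb/tbh semantics: the branch target is the PC (instruction address + 4) plus twice an unsigned byte (or halfword) table entry, so targets can never precede the table. A small simulation of the tbb dispatch arithmetic, with addresses as plain integers:

  #include <cstdint>
  #include <cstdio>

  // Sketch of tbb: target = (tbb address + 4) + 2 * table[index], with
  // byte-sized, unsigned entries -- hence forward-only, which is why the
  // jumptable destinations must be placed after (below) the table.
  static uint32_t tbbTarget(uint32_t TbbAddr, const uint8_t *Table,
                            unsigned Index) {
    uint32_t PC = TbbAddr + 4;       // Thumb reads PC as instruction + 4
    return PC + 2 * Table[Index];
  }

  int main() {
    const uint8_t Table[] = { 0, 4, 9 };   // entries are halfword counts
    uint32_t Tbb = 0x8000;
    for (unsigned i = 0; i != 3; ++i)
      std::printf("case %u -> %#x\n", i, (unsigned)tbbTarget(Tbb, Table, i));
    return 0;
  }
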
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index f3377f9..8fb1da3 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -537,3 +537,66 @@ Split out LDR (literal) from normal ARM LDR instruction. Also consider spliting
LDR into imm12 and so_reg forms. This allows us to clean up some code. e.g.
ARMLoadStoreOptimizer does not need to look at LDR (literal) and LDR (so_reg)
while ARMConstantIslandPass only need to worry about LDR (literal).
+
+//===---------------------------------------------------------------------===//
+
+We need to fix constant isel for ARMv6t2 to use MOVT.
+
+//===---------------------------------------------------------------------===//
+
+Constant island pass should make use of full range SoImm values for LEApcrel.
+Be careful though as the last attempt caused infinite looping on lencod.
+
+//===---------------------------------------------------------------------===//
+
+Predication issue. This function:
+
+extern unsigned array[ 128 ];
+int foo( int x ) {
+ int y;
+ y = array[ x & 127 ];
+ if ( x & 128 )
+ y = 123456789 & ( y >> 2 );
+ else
+ y = 123456789 & y;
+ return y;
+}
+
+compiles to:
+
+_foo:
+ and r1, r0, #127
+ ldr r2, LCPI1_0
+ ldr r2, [r2]
+ ldr r1, [r2, +r1, lsl #2]
+ mov r2, r1, lsr #2
+ tst r0, #128
+ moveq r2, r1
+ ldr r0, LCPI1_1
+ and r0, r2, r0
+ bx lr
+
+It would be better to do something like this, to fold the shift into the
+conditional move:
+
+ and r1, r0, #127
+ ldr r2, LCPI1_0
+ ldr r2, [r2]
+ ldr r1, [r2, +r1, lsl #2]
+ tst r0, #128
+ movne r1, r1, lsr #2
+ ldr r0, LCPI1_1
+ and r0, r1, r0
+ bx lr
+
+This saves an instruction and a register.
+
+//===---------------------------------------------------------------------===//
+
+add/sub/and/or + i32 imm can be simplified by folding part of the immediate
+into the operation.
+
+//===---------------------------------------------------------------------===//
+
+It might be profitable to cse MOVi16 if there are lots of 32-bit immediates
+with the same bottom half.
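
The immediate-folding note above relies on ARM's so_imm form: an 8-bit value rotated right by an even amount. A 32-bit immediate that is not itself a so_imm can often be split into two such chunks, folding one into each of two ALU ops. A standalone sketch of the greedy split, using the same leading-zeros-plus-mask trick as emitT2RegPlusImmediate later in this patch, but rounded to an even rotation for the ARM form (__builtin_clz is assumed, i.e. GCC/Clang):

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  static uint32_t rotr32(uint32_t V, unsigned Amt) {
    Amt &= 31;
    return Amt ? (V >> Amt) | (V << (32 - Amt)) : V;
  }

  // Peel one ARM so_imm chunk off Imm: an 8-bit window rotated right by an
  // even amount, aligned below Imm's topmost set bit. At most two chunks
  // are needed for a typical two-part add/sub/and/or immediate.
  static uint32_t takeSOImmChunk(uint32_t &Imm) {
    assert(Imm && "nothing left to take");
    unsigned RotAmt = __builtin_clz(Imm) & ~1u; // so_imm rotation is even
    uint32_t Chunk = Imm & rotr32(0xff000000u, RotAmt);
    Imm &= ~Chunk;
    return Chunk;
  }

  int main() {
    uint32_t Imm = 0x12340000;            // not a single so_imm
    uint32_t Hi = takeSOImmChunk(Imm);    // 0x12000000
    uint32_t Lo = takeSOImmChunk(Imm);    // 0x00340000
    assert(Imm == 0 && (Hi | Lo) == 0x12340000);
    std::printf("add r0, r1, #%#x\nadd r0, r0, #%#x\n",
                (unsigned)Hi, (unsigned)Lo);
    return 0;
  }
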
diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
new file mode 100644
index 0000000..163a0a9
--- /dev/null
+++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheARMTarget, llvm::TheThumbTarget;
+
+extern "C" void LLVMInitializeARMTargetInfo() {
+ RegisterTarget<Triple::arm, /*HasJIT=*/true>
+ X(TheARMTarget, "arm", "ARM");
+
+ RegisterTarget<Triple::thumb, /*HasJIT=*/true>
+ Y(TheThumbTarget, "thumb", "Thumb");
+}
diff --git a/lib/Target/ARM/TargetInfo/CMakeLists.txt b/lib/Target/ARM/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..3910bb0
--- /dev/null
+++ b/lib/Target/ARM/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARMInfo
+ ARMTargetInfo.cpp
+ )
+
+add_dependencies(LLVMARMInfo ARMCodeGenTable_gen)
diff --git a/lib/Target/ARM/TargetInfo/Makefile b/lib/Target/ARM/TargetInfo/Makefile
new file mode 100644
index 0000000..6292ab1
--- /dev/null
+++ b/lib/Target/ARM/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM/TargetInfo/Makefile ------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARMInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index e13a811..7eed30e 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -22,63 +22,29 @@
using namespace llvm;
-Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI(*this, STI) {
+Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) : RI(*this, STI) {
}
-bool Thumb1InstrInfo::isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned& SrcSubIdx, unsigned& DstSubIdx) const {
- SrcSubIdx = DstSubIdx = 0; // No sub-registers.
-
- unsigned oc = MI.getOpcode();
- switch (oc) {
- default:
- return false;
- case ARM::tMOVr:
- case ARM::tMOVhir2lor:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2hir:
- assert(MI.getDesc().getNumOperands() >= 2 &&
- MI.getOperand(0).isReg() &&
- MI.getOperand(1).isReg() &&
- "Invalid Thumb MOV instruction");
- SrcReg = MI.getOperand(1).getReg();
- DstReg = MI.getOperand(0).getReg();
- return true;
- }
-}
-
-unsigned Thumb1InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- case ARM::tRestore:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- }
+unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
return 0;
}
-unsigned Thumb1InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- case ARM::tSpill:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
+bool
+Thumb1InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case ARM::tBX_RET:
+ case ARM::tBX_RET_vararg:
+ case ARM::tPOP_RET:
+ case ARM::tB:
+ case ARM::tBR_JTr:
+ return true;
+ default:
break;
}
- return 0;
+
+ return false;
}
bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
@@ -91,15 +57,15 @@ bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
if (DestRC == ARM::GPRRegisterClass) {
if (SrcRC == ARM::GPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVhir2hir), DestReg).addReg(SrcReg);
+ BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg);
return true;
} else if (SrcRC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVlor2hir), DestReg).addReg(SrcReg);
+ BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg);
return true;
}
} else if (DestRC == ARM::tGPRRegisterClass) {
if (SrcRC == ARM::GPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVhir2lor), DestReg).addReg(SrcReg);
+ BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg);
return true;
} else if (SrcRC == ARM::tGPRRegisterClass) {
BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg);
@@ -120,17 +86,19 @@ canFoldMemoryOperand(const MachineInstr *MI,
switch (Opc) {
default: break;
case ARM::tMOVr:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2lor:
- case ARM::tMOVhir2hir: {
+ case ARM::tMOVtgpr2gpr:
+ case ARM::tMOVgpr2tgpr:
+ case ARM::tMOVgpr2gpr: {
if (OpNum == 0) { // move -> store
unsigned SrcReg = MI->getOperand(1).getReg();
- if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg))
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ !isARMLowRegister(SrcReg))
// tSpill cannot take a high register operand.
return false;
} else { // move -> load
unsigned DstReg = MI->getOperand(0).getReg();
- if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg))
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ !isARMLowRegister(DstReg))
// tRestore cannot target a high register operand.
return false;
}
@@ -148,36 +116,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!");
+ assert((RC == ARM::tGPRRegisterClass ||
+ (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ isARMLowRegister(SrcReg))) && "Unknown regclass!");
if (RC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tSpill))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSpill))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0));
}
}
-void Thumb1InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const{
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
-
- assert(RC == ARM::GPRRegisterClass && "Unknown regclass!");
- if (RC == ARM::GPRRegisterClass) {
- Opc = Addr[0].isFI() ? ARM::tSpill : ARM::tSTR;
- }
-
- MachineInstrBuilder MIB =
- BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
-}
-
void Thumb1InstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
@@ -185,33 +134,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!");
+ assert((RC == ARM::tGPRRegisterClass ||
+ (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+ isARMLowRegister(DestReg))) && "Unknown regclass!");
if (RC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg)
- .addFrameIndex(FI).addImm(0);
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg)
+ .addFrameIndex(FI).addImm(0));
}
}
-void Thumb1InstrInfo::
-loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
-
- if (RC == ARM::GPRRegisterClass) {
- Opc = Addr[0].isFI() ? ARM::tRestore : ARM::tLDR;
- }
-
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
-}
-
bool Thumb1InstrInfo::
spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
@@ -223,6 +155,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
if (MI != MBB.end()) DL = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH));
+ AddDefaultPred(MIB);
+ MIB.addReg(0); // No write back.
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
// Add the callee-saved register as live-in. It's killed at the spill.
@@ -242,7 +176,12 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
return false;
bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
- MachineInstr *PopMI = MF.CreateMachineInstr(get(ARM::tPOP),MI->getDebugLoc());
+ DebugLoc DL = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::tPOP));
+ AddDefaultPred(MIB);
+ MIB.addReg(0); // No write back.
+
+  unsigned NumRegs = 0;
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
if (Reg == ARM::LR) {
@@ -250,15 +189,16 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
if (isVarArg)
continue;
Reg = ARM::PC;
- PopMI->setDesc(get(ARM::tPOP_RET));
+ (*MIB).setDesc(get(ARM::tPOP_RET));
MI = MBB.erase(MI);
}
- PopMI->addOperand(MachineOperand::CreateReg(Reg, true));
+ MIB.addReg(Reg, getDefRegState(true));
+ ++NumRegs;
}
// It's illegal to emit pop instruction without operands.
- if (PopMI->getNumOperands() > 0)
- MBB.insert(MI, PopMI);
+ if (NumRegs)
+ MBB.insert(MI, &*MIB);
return true;
}
@@ -274,27 +214,30 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
switch (Opc) {
default: break;
case ARM::tMOVr:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2lor:
- case ARM::tMOVhir2hir: {
+ case ARM::tMOVtgpr2gpr:
+ case ARM::tMOVgpr2tgpr:
+ case ARM::tMOVgpr2gpr: {
if (OpNum == 0) { // move -> store
unsigned SrcReg = MI->getOperand(1).getReg();
bool isKill = MI->getOperand(1).isKill();
- if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg))
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ !isARMLowRegister(SrcReg))
// tSpill cannot take a high register operand.
break;
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
+ NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0));
} else { // move -> load
unsigned DstReg = MI->getOperand(0).getReg();
- if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg))
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ !isARMLowRegister(DstReg))
// tRestore cannot target a high register operand.
break;
bool isDead = MI->getOperand(0).isDead();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore))
- .addReg(DstReg, RegState::Define | getDeadRegState(isDead))
- .addFrameIndex(FI).addImm(0);
+ NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore))
+ .addReg(DstReg,
+ RegState::Define | getDeadRegState(isDead))
+ .addFrameIndex(FI).addImm(0));
}
break;
}
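
A recurring edit in this file wraps BuildMI calls in AddDefaultPred, and the flag-setting Thumb1 ALU forms additionally in AddDefaultT1CC: the first appends an always-true predicate (ARMCC::AL plus a zero condition register), the second a CPSR def for the 's' bit. A toy model of the resulting operand order, with strings standing in for MachineOperands; the real helpers are LLVM's, and only the shape is modeled here:

  #include <cstdio>
  #include <string>
  #include <vector>

  // Toy stand-in for MachineInstrBuilder: an opcode plus operand strings.
  struct InstBuilder {
    std::string Opc;
    std::vector<std::string> Ops;
    InstBuilder &addOp(const std::string &O) { Ops.push_back(O); return *this; }
  };

  // Models AddDefaultPred: append (ARMCC::AL, no predicate register).
  static InstBuilder &addDefaultPred(InstBuilder &B) {
    return B.addOp("pred:AL").addOp("predreg:noreg");
  }

  // Models AddDefaultT1CC: an extra CPSR def operand for the 's' bit.
  static InstBuilder &addDefaultT1CC(InstBuilder &B) {
    return B.addOp("def:CPSR");
  }

  int main() {
    // Shape of AddDefaultPred(AddDefaultT1CC(BuildMI(..., tADDi8, r0))...):
    InstBuilder B;
    B.Opc = "tADDi8";
    B.addOp("def:r0");                 // destination, added by BuildMI
    addDefaultT1CC(B);                 // CPSR def comes right after
    B.addOp("use:r0").addOp("imm:4");  // two-address source and immediate
    addDefaultPred(B);                 // trailing always-true predicate
    std::printf("%s", B.Opc.c_str());
    for (size_t i = 0; i != B.Ops.size(); ++i)
      std::printf(" %s", B.Ops[i].c_str());
    std::printf("\n");
    return 0;
  }
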
diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h
index 1bfa1d0..13cc578 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/lib/Target/ARM/Thumb1InstrInfo.h
@@ -27,6 +27,13 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo {
public:
explicit Thumb1InstrInfo(const ARMSubtarget &STI);
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const;
+
+ // Return true if the block does not fall through.
+ bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
@@ -40,14 +47,6 @@ public:
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI) const;
- bool isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
-
bool copyRegToReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned DestReg, unsigned SrcReg,
@@ -58,21 +57,11 @@ public:
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC) const;
- void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC) const;
- void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
bool canFoldMemoryOperand(const MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops) const;
@@ -80,7 +69,7 @@ public:
MachineInstr* MI,
const SmallVectorImpl<unsigned> &Ops,
int FrameIndex) const;
-
+
MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr* MI,
const SmallVectorImpl<unsigned> &Ops,
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 92f01d1..3c896da 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -13,12 +13,15 @@
#include "ARM.h"
#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "Thumb1InstrInfo.h"
#include "Thumb1RegisterInfo.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -30,14 +33,11 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-static cl::opt<bool>
-ThumbRegScavenging("enable-thumb-reg-scavenging",
- cl::Hidden,
- cl::desc("Enable register scavenging on Thumb"));
-
-Thumb1RegisterInfo::Thumb1RegisterInfo(const TargetInstrInfo &tii,
+Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii,
const ARMSubtarget &sti)
: ARMBaseRegisterInfo(tii, sti) {
}
@@ -46,20 +46,24 @@ Thumb1RegisterInfo::Thumb1RegisterInfo(const TargetInstrInfo &tii,
/// specified immediate.
void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, int Val,
- const TargetInstrInfo *TII,
- DebugLoc dl) const {
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx,
+ int Val,
+ ARMCC::CondCodes Pred,
+ unsigned PredReg) const {
MachineFunction &MF = *MBB.getParent();
MachineConstantPool *ConstantPool = MF.getConstantPool();
- Constant *C = ConstantInt::get(Type::Int32Ty, Val);
+ Constant *C = ConstantInt::get(
+ Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
- BuildMI(MBB, MBBI, dl, TII->get(ARM::tLDRcp), DestReg)
- .addConstantPoolIndex(Idx);
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRcp))
+ .addReg(DestReg, getDefRegState(true), SubIdx)
+ .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg);
}
const TargetRegisterClass*
-Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, MVT VT) const {
+Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, EVT VT) const {
if (isARMLowRegister(Reg))
return ARM::tGPRRegisterClass;
switch (Reg) {
@@ -75,9 +79,16 @@ Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, MVT VT) const {
bool
Thumb1RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
- return ThumbRegScavenging;
+ return true;
+}
+
+bool
+Thumb1RegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF)
+ const {
+ return true;
}
+
bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
const MachineFrameInfo *FFI = MF.getFrameInfo();
unsigned CFSize = FFI->getMaxCallFrameSize();
@@ -91,6 +102,7 @@ bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
return !MF.getFrameInfo()->hasVarSizedObjects();
}
+
/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
/// in a register using mov / mvn sequences or load the immediate from a
@@ -103,6 +115,7 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
const TargetInstrInfo &TII,
const Thumb1RegisterInfo& MRI,
DebugLoc dl) {
+ MachineFunction &MF = *MBB.getParent();
bool isHigh = !isARMLowRegister(DestReg) ||
(BaseReg != 0 && !isARMLowRegister(BaseReg));
bool isSub = false;
@@ -117,31 +130,31 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
unsigned LdReg = DestReg;
if (DestReg == ARM::SP) {
assert(BaseReg == ARM::SP && "Unexpected!");
- LdReg = ARM::R3;
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R3, RegState::Kill);
+ LdReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass);
}
if (NumBytes <= 255 && NumBytes >= 0)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes);
else if (NumBytes < 0 && NumBytes >= -255) {
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), LdReg)
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes);
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
.addReg(LdReg, RegState::Kill);
} else
- MRI.emitLoadConstPool(MBB, MBBI, LdReg, NumBytes, &TII, dl);
+ MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes);
// Emit add / sub.
int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
- const MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl,
- TII.get(Opc), DestReg);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+ if (Opc != ARM::tADDhirr)
+ MIB = AddDefaultT1CC(MIB);
if (DestReg == ARM::SP || isSub)
MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
else
MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
- if (DestReg == ARM::SP)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVhir2lor), ARM::R3)
- .addReg(ARM::R12, RegState::Kill);
+ AddDefaultPred(MIB);
}
/// calcNumMI - Returns the number of instructions required to materialize
@@ -187,6 +200,8 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
unsigned Scale = 1;
int Opc = 0;
int ExtraOpc = 0;
+ bool NeedCC = false;
+ bool NeedPred = false;
if (DestReg == BaseReg && BaseReg == ARM::SP) {
assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
@@ -213,7 +228,16 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
if (DestReg != BaseReg)
DstNotEqBase = true;
NumBits = 8;
- Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+ if (DestReg == ARM::SP) {
+ Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+ assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
+ NumBits = 7;
+ Scale = 4;
+ } else {
+ Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+ NumBits = 8;
+ NeedPred = NeedCC = true;
+ }
isTwoAddr = true;
}
@@ -233,8 +257,10 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
unsigned Chunk = (1 << 3) - 1;
unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
Bytes -= ThisVal;
- BuildMI(MBB, MBBI, dl,TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3), DestReg)
- .addReg(BaseReg, RegState::Kill).addImm(ThisVal);
+ const TargetInstrDesc &TID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3);
+ const MachineInstrBuilder MIB =
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg));
+ AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal));
} else {
BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
.addReg(BaseReg, RegState::Kill);
@@ -248,13 +274,22 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
Bytes -= ThisVal;
ThisVal /= Scale;
// Build the new tADD / tSUB.
- if (isTwoAddr)
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(DestReg).addImm(ThisVal);
+ if (isTwoAddr) {
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+ if (NeedCC)
+ MIB = AddDefaultT1CC(MIB);
+        MIB.addReg(DestReg).addImm(ThisVal);
+ if (NeedPred)
+ MIB = AddDefaultPred(MIB);
+ }
else {
bool isKill = BaseReg != ARM::SP;
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+ if (NeedCC)
+ MIB = AddDefaultT1CC(MIB);
+ MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
+ if (NeedPred)
+ MIB = AddDefaultPred(MIB);
BaseReg = DestReg;
if (Opc == ARM::tADDrSPi) {
@@ -265,15 +300,17 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
Scale = 1;
Chunk = ((1 << NumBits) - 1) * Scale;
Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
- isTwoAddr = true;
+ NeedPred = NeedCC = isTwoAddr = true;
}
}
}
- if (ExtraOpc)
- BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg)
- .addReg(DestReg, RegState::Kill)
- .addImm(((unsigned)NumBytes) & 3);
+ if (ExtraOpc) {
+ const TargetInstrDesc &TID = TII.get(ExtraOpc);
+ AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg))
+ .addReg(DestReg, RegState::Kill)
+ .addImm(((unsigned)NumBytes) & 3));
+ }
}
static void emitSPUpdate(MachineBasicBlock &MBB,
@@ -329,16 +366,64 @@ static void emitThumbConstant(MachineBasicBlock &MBB,
int Chunk = (1 << 8) - 1;
int ThisVal = (Imm > Chunk) ? Chunk : Imm;
Imm -= ThisVal;
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), DestReg).addImm(ThisVal);
+ AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8),
+ DestReg))
+ .addImm(ThisVal));
if (Imm > 0)
emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl);
- if (isSub)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), DestReg)
- .addReg(DestReg, RegState::Kill);
+ if (isSub) {
+ const TargetInstrDesc &TID = TII.get(ARM::tRSB);
+ AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg))
+ .addReg(DestReg, RegState::Kill));
+ }
}
-void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const{
+static void removeOperands(MachineInstr &MI, unsigned i) {
+ unsigned Op = i;
+ for (unsigned e = MI.getNumOperands(); i != e; ++i)
+ MI.RemoveOperand(Op);
+}
+
+int Thumb1RegisterInfo::
+rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int Offset,
+ unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const
+{
+  // If/when eliminateFrameIndex() conforms with the ARMBaseRegisterInfo
+  // version, the Thumb1-specific parts can be pulled out to here.
+ return 0;
+}
+
+/// saveScavengerRegister - Save the register so it can be used by the
+/// register scavenger. Return true.
+bool Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const {
+ // Thumb1 can't use the emergency spill slot on the stack because
+ // ldr/str immediate offsets must be positive, and if we're referencing
+  // off the frame pointer (if, for example, there are alloca() calls in
+  // the function), the offset will be negative. Use R12 instead since that's
+  // a call-clobbered register that we know won't be used in Thumb1 mode.
+
+ TII.copyRegToReg(MBB, I, ARM::R12, Reg, ARM::GPRRegisterClass, RC);
+ return true;
+}
+
+/// restoreScavengerRegister - restore a register saved by
+/// saveScavengerRegister().
+void Thumb1RegisterInfo::restoreScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const {
+ TII.copyRegToReg(MBB, I, Reg, ARM::R12, RC, ARM::GPRRegisterClass);
+}
+
+unsigned
+Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const{
+ unsigned VReg = 0;
unsigned i = 0;
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
@@ -380,7 +465,7 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned Scale = 1;
if (FrameReg != ARM::SP) {
Opcode = ARM::tADDi3;
- MI.setDesc(TII.get(ARM::tADDi3));
+ MI.setDesc(TII.get(Opcode));
NumBits = 3;
} else {
NumBits = 8;
@@ -391,19 +476,26 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (Offset == 0) {
// Turn it into a move.
- MI.setDesc(TII.get(ARM::tMOVhir2lor));
+ MI.setDesc(TII.get(ARM::tMOVgpr2tgpr));
MI.getOperand(i).ChangeToRegister(FrameReg, false);
MI.RemoveOperand(i+1);
- return;
+ return 0;
}
// Common case: small offset, fits into instruction.
unsigned Mask = (1 << NumBits) - 1;
if (((Offset / Scale) & ~Mask) == 0) {
// Replace the FrameIndex with sp / fp
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(Offset / Scale);
- return;
+ if (Opcode == ARM::tADDi3) {
+ removeOperands(MI, i);
+ MachineInstrBuilder MIB(&MI);
+ AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg)
+ .addImm(Offset / Scale));
+ } else {
+ MI.getOperand(i).ChangeToRegister(FrameReg, false);
+ MI.getOperand(i+1).ChangeToImmediate(Offset / Scale);
+ }
+ return 0;
}
unsigned DestReg = MI.getOperand(0).getReg();
@@ -415,15 +507,21 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII,
*this, dl);
MBB.erase(II);
- return;
+ return 0;
}
if (Offset > 0) {
// Translate r0 = add sp, imm to
// r0 = add sp, 255*4
// r0 = add r0, (imm - 255*4)
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(Mask);
+ if (Opcode == ARM::tADDi3) {
+ removeOperands(MI, i);
+ MachineInstrBuilder MIB(&MI);
+ AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask));
+ } else {
+ MI.getOperand(i).ChangeToRegister(FrameReg, false);
+ MI.getOperand(i+1).ChangeToImmediate(Mask);
+ }
Offset = (Offset - Mask * Scale);
MachineBasicBlock::iterator NII = next(II);
emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII,
@@ -433,11 +531,16 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// r0 = -imm (this is then translated into a series of instructons)
// r0 = add r0, sp
emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
+
MI.setDesc(TII.get(ARM::tADDhirr));
MI.getOperand(i).ChangeToRegister(DestReg, false, false, true);
MI.getOperand(i+1).ChangeToRegister(FrameReg, false);
+ if (Opcode == ARM::tADDi3) {
+ MachineInstrBuilder MIB(&MI);
+ AddDefaultPred(MIB);
+ }
}
- return;
+ return 0;
} else {
unsigned ImmIdx = 0;
int InstrOffs = 0;
@@ -452,8 +555,7 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
break;
}
default:
- assert(0 && "Unsupported addressing mode!");
- abort();
+ llvm_unreachable("Unsupported addressing mode!");
break;
}
@@ -468,7 +570,7 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Replace the FrameIndex with sp
MI.getOperand(i).ChangeToRegister(FrameReg, false);
ImmOp.ChangeToImmediate(ImmedOffset);
- return;
+ return 0;
}
bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill;
@@ -495,6 +597,11 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// SP+LargeImm.
assert(Offset && "This code isn't needed if offset already handled!");
+ // Remove predicate first.
+ int PIdx = MI.findFirstPredOperandIdx();
+ if (PIdx != -1)
+ removeOperands(MI, PIdx);
+
if (Desc.mayLoad()) {
// Use the destination register to materialize sp + offset.
unsigned TmpReg = MI.getOperand(0).getReg();
@@ -504,12 +611,14 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
Offset, false, TII, *this, dl);
else {
- emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl);
+ emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset);
UseRR = true;
}
- } else
+ } else {
emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
*this, dl);
+ }
+
MI.setDesc(TII.get(ARM::tLDR));
MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
if (UseRR)
@@ -518,52 +627,37 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
else // tLDR has an extra register operand.
MI.addOperand(MachineOperand::CreateReg(0, false));
} else if (Desc.mayStore()) {
- // FIXME! This is horrific!!! We need register scavenging.
- // Our temporary workaround has marked r3 unavailable. Of course, r3 is
- // also a ABI register so it's possible that is is the register that is
- // being storing here. If that's the case, we do the following:
- // r12 = r2
- // Use r2 to materialize sp + offset
- // str r3, r2
- // r2 = r12
- unsigned ValReg = MI.getOperand(0).getReg();
- unsigned TmpReg = ARM::R3;
- bool UseRR = false;
- if (ValReg == ARM::R3) {
- BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R2, RegState::Kill);
- TmpReg = ARM::R2;
- }
- if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
- BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R3, RegState::Kill);
- if (Opcode == ARM::tSpill) {
- if (FrameReg == ARM::SP)
- emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
- Offset, false, TII, *this, dl);
- else {
- emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl);
- UseRR = true;
- }
- } else
- emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
- *this, dl);
- MI.setDesc(TII.get(ARM::tSTR));
- MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
- if (UseRR) // Use [reg, reg] addrmode.
- MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
- else // tSTR has an extra register operand.
- MI.addOperand(MachineOperand::CreateReg(0, false));
-
- MachineBasicBlock::iterator NII = next(II);
- if (ValReg == ARM::R3)
- BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R2)
- .addReg(ARM::R12, RegState::Kill);
- if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
- BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R3)
- .addReg(ARM::R12, RegState::Kill);
+ VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass);
+    assert(Value &&
+           "Frame index allocated a virtual reg, but Value arg is NULL!");
+ *Value = Offset;
+ bool UseRR = false;
+
+ if (Opcode == ARM::tSpill) {
+ if (FrameReg == ARM::SP)
+ emitThumbRegPlusImmInReg(MBB, II, VReg, FrameReg,
+ Offset, false, TII, *this, dl);
+ else {
+ emitLoadConstPool(MBB, II, dl, VReg, 0, Offset);
+ UseRR = true;
+ }
+ } else
+ emitThumbRegPlusImmediate(MBB, II, VReg, FrameReg, Offset, TII,
+ *this, dl);
+ MI.setDesc(TII.get(ARM::tSTR));
+ MI.getOperand(i).ChangeToRegister(VReg, false, false, true);
+ if (UseRR) // Use [reg, reg] addrmode.
+ MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
+ else // tSTR has an extra register operand.
+ MI.addOperand(MachineOperand::CreateReg(0, false));
} else
assert(false && "Unexpected opcode!");
+
+ // Add predicate back if it's needed.
+ if (MI.getDesc().isPredicable()) {
+ MachineInstrBuilder MIB(&MI);
+ AddDefaultPred(MIB);
+ }
+ return VReg;
}
void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const {
@@ -577,15 +671,6 @@ void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const {
DebugLoc dl = (MBBI != MBB.end() ?
MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
- // Check if R3 is live in. It might have to be used as a scratch register.
- for (MachineRegisterInfo::livein_iterator I =MF.getRegInfo().livein_begin(),
- E = MF.getRegInfo().livein_end(); I != E; ++I) {
- if (I->first == ARM::R3) {
- AFI->setR3IsLiveIn(true);
- break;
- }
- }
-
// Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
NumBytes = (NumBytes + 3) & ~3;
MFI->setStackSize(NumBytes);
@@ -647,8 +732,7 @@ void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const {
// Darwin ABI requires FP to point to the stack slot that contains the
// previous FP.
if (STI.isTargetDarwin() || hasFP(MF)) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
.addFrameIndex(FramePtrSpillFI).addImm(0);
}
@@ -729,7 +813,7 @@ void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF,
emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes,
TII, *this, dl);
else
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::SP)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP)
.addReg(FramePtr);
} else {
if (MBBI->getOpcode() == ARM::tBX_RET &&
@@ -745,11 +829,14 @@ void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF,
if (VARegSaveSize) {
// Epilogue for vararg functions: pop LR to R3 and branch off it.
// FIXME: Verify this is still ok when R3 is no longer being reserved.
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)).addReg(ARM::R3);
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+ .addReg(0) // No write back.
+ .addReg(ARM::R3, RegState::Define);
emitSPUpdate(MBB, MBBI, TII, dl, *this, VARegSaveSize);
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)).addReg(ARM::R3);
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg))
+ .addReg(ARM::R3, RegState::Kill);
MBB.erase(MBBI);
}
}
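
emitThumbRegPlusImmediate above works by peeling the offset into the largest chunk each instruction form can carry: imm3 for the three-address tADDi3/tSUBi3, imm8 for the two-address tADDi8/tSUBi8, and imm7 scaled by 4 for the sp forms. A simplified standalone sketch of that chunk loop for the low-register, non-SP, DestReg != BaseReg case; kill flags, CPSR/predicate operands, and the constant-pool fallback are all omitted:

  #include <cstdio>

  // Assumed widths, per the code above: imm3 for the first, three-address
  // add/sub (which also moves Base into Dest), then imm8 two-address forms.
  static void emitRegPlusImm(const char *Dest, const char *Base,
                             int NumBytes) {
    bool isSub = NumBytes < 0;
    unsigned Bytes = isSub ? -NumBytes : NumBytes;
    const char *Op = isSub ? "subs" : "adds";

    unsigned Chunk = (1u << 3) - 1;
    unsigned ThisVal = Bytes > Chunk ? Chunk : Bytes;
    Bytes -= ThisVal;
    std::printf("  %s %s, %s, #%u\n", Op, Dest, Base, ThisVal);

    Chunk = (1u << 8) - 1;
    while (Bytes) {
      ThisVal = Bytes > Chunk ? Chunk : Bytes;
      Bytes -= ThisVal;
      std::printf("  %s %s, #%u\n", Op, Dest, ThisVal);
    }
  }

  int main() {
    std::printf("r0 = r1 + 300:\n");
    // Expect: adds r0, r1, #7; adds r0, #255; adds r0, #38.
    emitRegPlusImm("r0", "r1", 300);
    return 0;
  }
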
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index 6d4f1f0..bb7a619 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -20,28 +20,28 @@
namespace llvm {
class ARMSubtarget;
- class TargetInstrInfo;
+ class ARMBaseInstrInfo;
class Type;
struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
public:
- Thumb1RegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
+ Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
- void emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, int Val,
- const TargetInstrInfo *TII,
- DebugLoc dl) const;
+ void emitLoadConstPool(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0) const;
/// Code Generation virtual methods...
const TargetRegisterClass *
- getPhysicalRegisterRegClass(unsigned Reg, MVT VT = MVT::Other) const;
-
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+ getPhysicalRegisterRegClass(unsigned Reg, EVT VT = MVT::Other) const;
bool requiresRegisterScavenging(const MachineFunction &MF) const;
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
bool hasReservedCallFrame(MachineFunction &MF) const;
@@ -49,8 +49,23 @@ public:
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
+  // Rewrite MI to access 'Offset' bytes from the FP. Return the offset that
+ // could not be handled directly in MI.
+ int rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int Offset,
+ unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const;
+
+ bool saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const;
+ void restoreScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const;
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
void emitPrologue(MachineFunction &MF) const;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
new file mode 100644
index 0000000..98b5cbd
--- /dev/null
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -0,0 +1,158 @@
+//===-- Thumb2ITBlockPass.cpp - Insert Thumb IT blocks -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "thumb2-it"
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumITs, "Number of IT blocks inserted");
+
+namespace {
+ struct VISIBILITY_HIDDEN Thumb2ITBlockPass : public MachineFunctionPass {
+ static char ID;
+ Thumb2ITBlockPass() : MachineFunctionPass(&ID) {}
+
+ const Thumb2InstrInfo *TII;
+ ARMFunctionInfo *AFI;
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "Thumb IT blocks insertion pass";
+ }
+
+ private:
+ MachineBasicBlock::iterator
+ SplitT2MOV32imm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineInstr *MI, DebugLoc dl,
+ unsigned PredReg, ARMCC::CondCodes CC);
+ bool InsertITBlocks(MachineBasicBlock &MBB);
+ };
+ char Thumb2ITBlockPass::ID = 0;
+}
+
+static ARMCC::CondCodes getPredicate(const MachineInstr *MI, unsigned &PredReg){
+ unsigned Opc = MI->getOpcode();
+ if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
+ return ARMCC::AL;
+ return llvm::getInstrPredicate(MI, PredReg);
+}
+
+MachineBasicBlock::iterator
+Thumb2ITBlockPass::SplitT2MOV32imm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineInstr *MI,
+ DebugLoc dl, unsigned PredReg,
+ ARMCC::CondCodes CC) {
+  // Split t2MOVi32imm into a pair of t2MOVi16 + t2MOVTi16 here.
+  // The only reason it was a single instruction was so it could be
+  // re-materialized. We want to split it before this pass and the Thumb2
+  // size reduction pass run, to keep the IT mask correct and to expose
+  // width reduction opportunities. It doesn't make sense to do this in a
+  // separate pass, so it is done here.
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool DstDead = MI->getOperand(0).isDead(); // Is this possible?
+ unsigned Imm = MI->getOperand(1).getImm();
+ unsigned Lo16 = Imm & 0xffff;
+ unsigned Hi16 = (Imm >> 16) & 0xffff;
+ BuildMI(MBB, MBBI, dl, TII->get(ARM::t2MOVi16), DstReg)
+ .addImm(Lo16).addImm(CC).addReg(PredReg);
+ BuildMI(MBB, MBBI, dl, TII->get(ARM::t2MOVTi16))
+ .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead))
+ .addReg(DstReg).addImm(Hi16).addImm(CC).addReg(PredReg);
+ --MBBI;
+ --MBBI;
+ MI->eraseFromParent();
+ return MBBI;
+}
+
+bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineInstr *MI = &*MBBI;
+ DebugLoc dl = MI->getDebugLoc();
+ unsigned PredReg = 0;
+ ARMCC::CondCodes CC = getPredicate(MI, PredReg);
+
+ if (MI->getOpcode() == ARM::t2MOVi32imm) {
+ MBBI = SplitT2MOV32imm(MBB, MBBI, MI, dl, PredReg, CC);
+ continue;
+ }
+
+ if (CC == ARMCC::AL) {
+ ++MBBI;
+ continue;
+ }
+
+ // Insert an IT instruction.
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT))
+ .addImm(CC);
+ ++MBBI;
+
+ // Finalize IT mask.
+ ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
+ unsigned Mask = 0, Pos = 3;
+ while (MBBI != E && Pos) {
+ MachineInstr *NMI = &*MBBI;
+ DebugLoc ndl = NMI->getDebugLoc();
+ unsigned NPredReg = 0;
+ ARMCC::CondCodes NCC = getPredicate(NMI, NPredReg);
+ if (NMI->getOpcode() == ARM::t2MOVi32imm) {
+ MBBI = SplitT2MOV32imm(MBB, MBBI, NMI, ndl, NPredReg, NCC);
+ continue;
+ }
+
+ if (NCC == OCC) {
+ Mask |= (1 << Pos);
+ } else if (NCC != CC)
+ break;
+ --Pos;
+ ++MBBI;
+ }
+ Mask |= (1 << Pos);
+ MIB.addImm(Mask);
+ Modified = true;
+ ++NumITs;
+ }
+
+ return Modified;
+}
+
+bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetMachine &TM = Fn.getTarget();
+ AFI = Fn.getInfo<ARMFunctionInfo>();
+ TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+
+ if (!AFI->isThumbFunction())
+ return false;
+
+ bool Modified = false;
+ for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+ ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ Modified |= InsertITBlocks(MBB);
+ }
+
+ return Modified;
+}
+
+/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks
+/// insertion pass.
+FunctionPass *llvm::createThumb2ITBlockPass() {
+ return new Thumb2ITBlockPass();
+}
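
The mask built in InsertITBlocks encodes up to three trailing then/else slots after the instruction that opened the block: starting at Pos = 3, each opposite-condition ("else") instruction sets bit Pos, a same-condition instruction just advances, anything else ends the block, and the final 1 << Pos is the terminating bit. A standalone sketch of exactly that computation; the value shown is the raw operand this pass feeds t2IT, with the condition's own low bit not folded in at this stage:

  #include <cstdio>
  #include <vector>

  enum Slot { Then, Else, Other };

  // Mirror of the mask loop above: the triggering instruction is the
  // implicit first "then"; up to three following instructions extend it.
  static unsigned computeITMask(const std::vector<Slot> &Following) {
    unsigned Mask = 0, Pos = 3;
    for (size_t i = 0; i != Following.size() && Pos; ++i) {
      if (Following[i] == Else)
        Mask |= (1u << Pos);
      else if (Following[i] != Then)
        break;
      --Pos;
    }
    return Mask | (1u << Pos);
  }

  int main() {
    std::vector<Slot> None;                   // a lone conditional: "it"
    std::printf("it:   mask %#x\n", computeITMask(None));   // 0x8
    std::vector<Slot> TE;                     // then, then, else: "itte"
    TE.push_back(Then);
    TE.push_back(Else);
    std::printf("itte: mask %#x\n", computeITMask(TE));     // 0x6
    return 0;
  }
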
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 35d09fd..264601b 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -13,6 +13,7 @@
#include "ARMInstrInfo.h"
#include "ARM.h"
+#include "ARMAddressingModes.h"
#include "ARMGenInstrInfo.inc"
#include "ARMMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -22,127 +23,62 @@
using namespace llvm;
-Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI(*this, STI) {
+Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : RI(*this, STI) {
}
-bool Thumb2InstrInfo::isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned& SrcSubIdx, unsigned& DstSubIdx) const {
- SrcSubIdx = DstSubIdx = 0; // No sub-registers.
-
- unsigned oc = MI.getOpcode();
- switch (oc) {
- default:
- return false;
- // FIXME: Thumb2
- case ARM::tMOVr:
- case ARM::tMOVhir2lor:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2hir:
- assert(MI.getDesc().getNumOperands() >= 2 &&
- MI.getOperand(0).isReg() &&
- MI.getOperand(1).isReg() &&
- "Invalid Thumb MOV instruction");
- SrcReg = MI.getOperand(1).getReg();
- DstReg = MI.getOperand(0).getReg();
- return true;
- }
-}
-
-unsigned Thumb2InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- // FIXME: Thumb2
- case ARM::tRestore:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- }
+unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+ // FIXME
return 0;
}
-unsigned Thumb2InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- // FIXME: Thumb2
- case ARM::tSpill:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
+bool
+Thumb2InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case ARM::t2LDM_RET:
+ case ARM::t2B: // Uncond branch.
+ case ARM::t2BR_JT: // Jumptable branch.
+ case ARM::t2TBB: // Table branch byte.
+ case ARM::t2TBH: // Table branch halfword.
+ case ARM::tBR_JTr: // Jumptable branch (16-bit version).
+ case ARM::tBX_RET:
+ case ARM::tBX_RET_vararg:
+ case ARM::tPOP_RET:
+ case ARM::tB:
+ return true;
+ default:
break;
}
- return 0;
+
+ return false;
}
-bool Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SrcReg,
- const TargetRegisterClass *DestRC,
- const TargetRegisterClass *SrcRC) const {
+bool
+Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- // FIXME: Thumb2
- if (DestRC == ARM::GPRRegisterClass) {
- if (SrcRC == ARM::GPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVhir2hir), DestReg).addReg(SrcReg);
- return true;
- } else if (SrcRC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVlor2hir), DestReg).addReg(SrcReg);
- return true;
- }
- } else if (DestRC == ARM::tGPRRegisterClass) {
- if (SrcRC == ARM::GPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVhir2lor), DestReg).addReg(SrcReg);
- return true;
- } else if (SrcRC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg);
- return true;
- }
- }
-
- return false;
-}
-
-bool Thumb2InstrInfo::
-canFoldMemoryOperand(const MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops) const {
- if (Ops.size() != 1) return false;
-
- unsigned OpNum = Ops[0];
- unsigned Opc = MI->getOpcode();
- switch (Opc) {
- default: break;
- case ARM::tMOVr:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2lor:
- case ARM::tMOVhir2hir: {
- if (OpNum == 0) { // move -> store
- unsigned SrcReg = MI->getOperand(1).getReg();
- if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg))
- // tSpill cannot take a high register operand.
- return false;
- } else { // move -> load
- unsigned DstReg = MI->getOperand(0).getReg();
- if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg))
- // tRestore cannot target a high register operand.
- return false;
- }
+ if (DestRC == ARM::GPRRegisterClass &&
+ SrcRC == ARM::GPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg);
+ return true;
+ } else if (DestRC == ARM::GPRRegisterClass &&
+ SrcRC == ARM::tGPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg);
+ return true;
+ } else if (DestRC == ARM::tGPRRegisterClass &&
+ SrcRC == ARM::GPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg);
return true;
- }
}
- return false;
+ // Handle SPR, DPR, and QPR copies.
+ return ARMBaseInstrInfo::copyRegToReg(MBB, I, DestReg, SrcReg, DestRC, SrcRC);
}
void Thumb2InstrInfo::
@@ -152,36 +88,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!");
-
- // FIXME: Thumb2
- if (RC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tSpill))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
- }
-}
-
-void Thumb2InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const{
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
-
- // FIXME: Thumb2. Is GPRRegClass here correct?
- assert(RC == ARM::GPRRegisterClass && "Unknown regclass!");
if (RC == ARM::GPRRegisterClass) {
- Opc = Addr[0].isFI() ? ARM::tSpill : ARM::tSTR;
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0));
+ return;
}
- MachineInstrBuilder MIB =
- BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
+ ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC);
}
void Thumb2InstrInfo::
@@ -191,122 +105,381 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- // FIXME: Thumb2
- assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!");
-
- if (RC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg)
- .addFrameIndex(FI).addImm(0);
+ if (RC == ARM::GPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
+ .addFrameIndex(FI).addImm(0));
+ return;
}
+
+ ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC);
}
-void Thumb2InstrInfo::
-loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
- // FIXME: Thumb2. Is GPRRegClass ok here?
- if (RC == ARM::GPRRegisterClass) {
- Opc = Addr[0].isFI() ? ARM::tRestore : ARM::tLDR;
+void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ unsigned DestReg, unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII) {
+ bool isSub = NumBytes < 0;
+ if (isSub) NumBytes = -NumBytes;
+
+ // If profitable, use a movw or movt to materialize the offset.
+ // FIXME: Use the scavenger to grab a scratch register.
+ if (DestReg != ARM::SP && DestReg != BaseReg &&
+ NumBytes >= 4096 &&
+ ARM_AM::getT2SOImmVal(NumBytes) == -1) {
+ bool Fits = false;
+ if (NumBytes < 65536) {
+ // Use a movw to materialize the 16-bit constant.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg)
+ .addImm(NumBytes)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ Fits = true;
+ } else if ((NumBytes & 0xffff) == 0) {
+ // Use a movt to materialize the constant's high 16 bits (low bits are zero).
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg)
+ .addReg(DestReg)
+ .addImm(NumBytes >> 16)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ Fits = true;
+ }
+
+ if (Fits) {
+ if (isSub) {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addReg(DestReg, RegState::Kill)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ } else {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg)
+ .addReg(DestReg, RegState::Kill)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ }
+ return;
+ }
}
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
-}
+ while (NumBytes) {
+ unsigned ThisVal = NumBytes;
+ unsigned Opc = 0;
+ if (DestReg == ARM::SP && BaseReg != ARM::SP) {
+ // mov sp, rn. Note t2MOVr cannot be used.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr),DestReg).addReg(BaseReg);
+ BaseReg = ARM::SP;
+ continue;
+ }
-bool Thumb2InstrInfo::
-spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI) const {
- if (CSI.empty())
- return false;
+ if (BaseReg == ARM::SP) {
+ // sub sp, sp, #imm7
+ if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
+ assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
+ Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+ // FIXME: Fix Thumb1 immediate encoding.
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg).addImm(ThisVal/4);
+ NumBytes = 0;
+ continue;
+ }
+
+ // sub rd, sp, so_imm
+ Opc = isSub ? ARM::t2SUBrSPi : ARM::t2ADDrSPi;
+ if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+ NumBytes = 0;
+ } else {
+ // FIXME: Move this to ARMAddressingModes.h?
+ unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+ ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+ NumBytes &= ~ThisVal;
+ assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+ "Bit extraction didn't work?");
+ }
+ } else {
+ assert(DestReg != ARM::SP && BaseReg != ARM::SP);
+ Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+ if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+ NumBytes = 0;
+ } else if (ThisVal < 4096) {
+ Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+ NumBytes = 0;
+ } else {
+ // FIXME: Move this to ARMAddressingModes.h?
+ unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+ ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+ NumBytes &= ~ThisVal;
+ assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+ "Bit extraction didn't work?");
+ }
+ }
- DebugLoc DL = DebugLoc::getUnknownLoc();
- if (MI != MBB.end()) DL = MI->getDebugLoc();
-
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH));
- for (unsigned i = CSI.size(); i != 0; --i) {
- unsigned Reg = CSI[i-1].getReg();
- // Add the callee-saved register as live-in. It's killed at the spill.
- MBB.addLiveIn(Reg);
- MIB.addReg(Reg, RegState::Kill);
+ // Build the new ADD / SUB.
+ AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm(ThisVal)));
+
+ BaseReg = DestReg;
}
- return true;
}
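
The loop above peels one Thumb-2 modified-immediate chunk (8 contiguous significant bits) off NumBytes per emitted add / sub. Below is a minimal standalone sketch of that extraction; clz32 and rotr32 are portable stand-ins for CountLeadingZeros_32 and ARM_AM::rotr32, not the in-tree helpers.

  #include <cstdint>
  #include <cstdio>

  // Hypothetical stand-ins for CountLeadingZeros_32 / ARM_AM::rotr32.
  static unsigned clz32(uint32_t V) {
    unsigned N = 0;
    while (!(V & 0x80000000u)) { V <<= 1; ++N; }
    return N;
  }
  static uint32_t rotr32(uint32_t V, unsigned Amt) {
    Amt &= 31;
    return (V >> Amt) | (V << ((32 - Amt) & 31));
  }

  int main() {
    uint32_t NumBytes = 0x12345; // too large for a single t2 so_imm
    while (NumBytes) {
      // Keep the 8 bits aligned with the most significant set bit.
      unsigned RotAmt = clz32(NumBytes);
      uint32_t ThisVal = NumBytes & rotr32(0xff000000u, RotAmt);
      NumBytes &= ~ThisVal;
      std::printf("chunk: %#x\n", ThisVal); // 0x12200, 0x144, 0x1
    }
    return 0;
  }
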
-bool Thumb2InstrInfo::
-restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI) const {
- MachineFunction &MF = *MBB.getParent();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- if (CSI.empty())
- return false;
-
- bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
- MachineInstr *PopMI = MF.CreateMachineInstr(get(ARM::tPOP),MI->getDebugLoc());
- for (unsigned i = CSI.size(); i != 0; --i) {
- unsigned Reg = CSI[i-1].getReg();
- if (Reg == ARM::LR) {
- // Special epilogue for vararg functions. See emitEpilogue
- if (isVarArg)
- continue;
- Reg = ARM::PC;
- PopMI->setDesc(get(ARM::tPOP_RET));
- MI = MBB.erase(MI);
- }
- PopMI->addOperand(MachineOperand::CreateReg(Reg, true));
+static unsigned
+negativeOffsetOpcode(unsigned opcode)
+{
+ switch (opcode) {
+ case ARM::t2LDRi12: return ARM::t2LDRi8;
+ case ARM::t2LDRHi12: return ARM::t2LDRHi8;
+ case ARM::t2LDRBi12: return ARM::t2LDRBi8;
+ case ARM::t2LDRSHi12: return ARM::t2LDRSHi8;
+ case ARM::t2LDRSBi12: return ARM::t2LDRSBi8;
+ case ARM::t2STRi12: return ARM::t2STRi8;
+ case ARM::t2STRBi12: return ARM::t2STRBi8;
+ case ARM::t2STRHi12: return ARM::t2STRHi8;
+
+ case ARM::t2LDRi8:
+ case ARM::t2LDRHi8:
+ case ARM::t2LDRBi8:
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRSBi8:
+ case ARM::t2STRi8:
+ case ARM::t2STRBi8:
+ case ARM::t2STRHi8:
+ return opcode;
+
+ default:
+ break;
}
- // It's illegal to emit a pop instruction without operands.
- if (PopMI->getNumOperands() > 0)
- MBB.insert(MI, PopMI);
+ return 0;
+}
+
+static unsigned
+positiveOffsetOpcode(unsigned opcode)
+{
+ switch (opcode) {
+ case ARM::t2LDRi8: return ARM::t2LDRi12;
+ case ARM::t2LDRHi8: return ARM::t2LDRHi12;
+ case ARM::t2LDRBi8: return ARM::t2LDRBi12;
+ case ARM::t2LDRSHi8: return ARM::t2LDRSHi12;
+ case ARM::t2LDRSBi8: return ARM::t2LDRSBi12;
+ case ARM::t2STRi8: return ARM::t2STRi12;
+ case ARM::t2STRBi8: return ARM::t2STRBi12;
+ case ARM::t2STRHi8: return ARM::t2STRHi12;
+
+ case ARM::t2LDRi12:
+ case ARM::t2LDRHi12:
+ case ARM::t2LDRBi12:
+ case ARM::t2LDRSHi12:
+ case ARM::t2LDRSBi12:
+ case ARM::t2STRi12:
+ case ARM::t2STRBi12:
+ case ARM::t2STRHi12:
+ return opcode;
- return true;
+ default:
+ break;
+ }
+
+ return 0;
}
-MachineInstr *Thumb2InstrInfo::
-foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops, int FI) const {
- if (Ops.size() != 1) return NULL;
-
- unsigned OpNum = Ops[0];
- unsigned Opc = MI->getOpcode();
- MachineInstr *NewMI = NULL;
- switch (Opc) {
- default: break;
- case ARM::tMOVr:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2lor:
- case ARM::tMOVhir2hir: {
- if (OpNum == 0) { // move -> store
- unsigned SrcReg = MI->getOperand(1).getReg();
- bool isKill = MI->getOperand(1).isKill();
- if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg))
- // tSpill cannot take a high register operand.
- break;
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
- } else { // move -> load
- unsigned DstReg = MI->getOperand(0).getReg();
- if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg))
- // tRestore cannot target a high register operand.
- break;
- bool isDead = MI->getOperand(0).isDead();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore))
- .addReg(DstReg, RegState::Define | getDeadRegState(isDead))
- .addFrameIndex(FI).addImm(0);
- }
+static unsigned
+immediateOffsetOpcode(unsigned opcode)
+{
+ switch (opcode) {
+ case ARM::t2LDRs: return ARM::t2LDRi12;
+ case ARM::t2LDRHs: return ARM::t2LDRHi12;
+ case ARM::t2LDRBs: return ARM::t2LDRBi12;
+ case ARM::t2LDRSHs: return ARM::t2LDRSHi12;
+ case ARM::t2LDRSBs: return ARM::t2LDRSBi12;
+ case ARM::t2STRs: return ARM::t2STRi12;
+ case ARM::t2STRBs: return ARM::t2STRBi12;
+ case ARM::t2STRHs: return ARM::t2STRHi12;
+
+ case ARM::t2LDRi12:
+ case ARM::t2LDRHi12:
+ case ARM::t2LDRBi12:
+ case ARM::t2LDRSHi12:
+ case ARM::t2LDRSBi12:
+ case ARM::t2STRi12:
+ case ARM::t2STRBi12:
+ case ARM::t2STRHi12:
+ case ARM::t2LDRi8:
+ case ARM::t2LDRHi8:
+ case ARM::t2LDRBi8:
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRSBi8:
+ case ARM::t2STRi8:
+ case ARM::t2STRBi8:
+ case ARM::t2STRHi8:
+ return opcode;
+
+ default:
break;
}
+
+ return 0;
+}
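+
+Together, the three helpers above move a T2 load / store between its register-offset ('s'), positive 12-bit ('i12') and negative 8-bit ('i8') forms. Below is a hedged sketch of the sign-based selection rule that rewriteT2FrameIndex applies; selectForm is a hypothetical wrapper, not in-tree code.
+
+  struct FormChoice {
+    unsigned Opc;     // opcode to retarget the MachineInstr to
+    unsigned NumBits; // immediate field width of that form
+    bool isSub;       // offset magnitude is encoded, sign via the form
+  };
+
+  static FormChoice selectForm(unsigned Opcode, int Offset) {
+    FormChoice C;
+    if (Offset < 0) {
+      C.Opc = negativeOffsetOpcode(Opcode); // t2*i8 form
+      C.NumBits = 8;
+      C.isSub = true;
+    } else {
+      C.Opc = positiveOffsetOpcode(Opcode); // t2*i12 form
+      C.NumBits = 12;
+      C.isSub = false;
+    }
+    return C;
+  }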
+
+bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII) {
+ unsigned Opcode = MI.getOpcode();
+ const TargetInstrDesc &Desc = MI.getDesc();
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ bool isSub = false;
+
+ // Memory operands in inline assembly always use AddrModeT2_i12.
+ if (Opcode == ARM::INLINEASM)
+ AddrMode = ARMII::AddrModeT2_i12; // FIXME: correct mode for Thumb2?
+
+ if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
+ Offset += MI.getOperand(FrameRegIdx+1).getImm();
+
+ bool isSP = FrameReg == ARM::SP;
+ if (Offset == 0) {
+ // Turn it into a move.
+ MI.setDesc(TII.get(ARM::tMOVgpr2gpr));
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.RemoveOperand(FrameRegIdx+1);
+ Offset = 0;
+ return true;
+ }
+
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ MI.setDesc(TII.get(isSP ? ARM::t2SUBrSPi : ARM::t2SUBri));
+ } else {
+ MI.setDesc(TII.get(isSP ? ARM::t2ADDrSPi : ARM::t2ADDri));
+ }
+
+ // Common case: small offset, fits into instruction.
+ if (ARM_AM::getT2SOImmVal(Offset) != -1) {
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+ Offset = 0;
+ return true;
+ }
+ // Another common case: imm12.
+ if (Offset < 4096) {
+ unsigned NewOpc = isSP
+ ? (isSub ? ARM::t2SUBrSPi12 : ARM::t2ADDrSPi12)
+ : (isSub ? ARM::t2SUBri12 : ARM::t2ADDri12);
+ MI.setDesc(TII.get(NewOpc));
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, extract 8 adjacent bits from the immediate into this
+ // t2ADDri/t2SUBri.
+ unsigned RotAmt = CountLeadingZeros_32(Offset);
+ unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt);
+
+ // We will handle these bits of the offset here, so clear them.
+ Offset &= ~ThisImmVal;
+
+ assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 &&
+ "Bit extraction didn't work?");
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+ } else {
+
+ // AddrMode4 cannot handle any offset.
+ if (AddrMode == ARMII::AddrMode4)
+ return false;
+
+ // AddrModeT2_so cannot handle any offset. If there is no offset
+ // register, then we change to an immediate version.
+ unsigned NewOpc = Opcode;
+ if (AddrMode == ARMII::AddrModeT2_so) {
+ unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg();
+ if (OffsetReg != 0) {
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ return Offset == 0;
+ }
+
+ MI.RemoveOperand(FrameRegIdx+1);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0);
+ NewOpc = immediateOffsetOpcode(Opcode);
+ AddrMode = ARMII::AddrModeT2_i12;
+ }
+
+ unsigned NumBits = 0;
+ unsigned Scale = 1;
+ if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) {
+ // i8 supports only negative offsets and i12 supports only positive
+ // ones, so convert Opcode to the appropriate instruction based on
+ // the sign of Offset.
+ Offset += MI.getOperand(FrameRegIdx+1).getImm();
+ if (Offset < 0) {
+ NewOpc = negativeOffsetOpcode(Opcode);
+ NumBits = 8;
+ isSub = true;
+ Offset = -Offset;
+ } else {
+ NewOpc = positiveOffsetOpcode(Opcode);
+ NumBits = 12;
+ }
+ } else {
+ // VFP and NEON address modes.
+ int InstrOffs = 0;
+ if (AddrMode == ARMII::AddrMode5) {
+ const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1);
+ InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+ if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ }
+ NumBits = 8;
+ Scale = 4;
+ Offset += InstrOffs * 4;
+ assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ }
+ }
+
+ if (NewOpc != Opcode)
+ MI.setDesc(TII.get(NewOpc));
+
+ MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
+
+ // Attempt to fold the address computation.
+ // Common case: small offset, fits into instruction.
+ int ImmedOffset = Offset / Scale;
+ unsigned Mask = (1 << NumBits) - 1;
+ if ((unsigned)Offset <= Mask * Scale) {
+ // Replace the FrameIndex with fp/sp
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ if (isSub) {
+ if (AddrMode == ARMII::AddrMode5)
+ // FIXME: Not consistent.
+ ImmedOffset |= 1 << NumBits;
+ else
+ ImmedOffset = -ImmedOffset;
+ }
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, the offset doesn't fit. Pull in what we can to simplify the immediate.
+ ImmedOffset = ImmedOffset & Mask;
+ if (isSub) {
+ if (AddrMode == ARMII::AddrMode5)
+ // FIXME: Not consistent.
+ ImmedOffset |= 1 << NumBits;
+ else {
+ ImmedOffset = -ImmedOffset;
+ if (ImmedOffset == 0)
+ // Change the opcode back if the encoded offset is zero.
+ MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc)));
+ }
+ }
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset &= ~(Mask*Scale);
}
- return NewMI;
+ Offset = (isSub) ? -Offset : Offset;
+ return Offset == 0;
}
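
A worked instance of the final fold above, for AddrModeT2_i12 (NumBits = 12, Scale = 1) and a resolved offset of 0x1234 that does not fit: the imm12 field keeps the low 12 bits and the remainder is handed back to the caller to materialize separately. Illustration only, not in-tree code.

  #include <cassert>

  int main() {
    int Offset = 0x1234;                       // resolved offset, > 4095
    unsigned NumBits = 12, Scale = 1;
    unsigned Mask = (1u << NumBits) - 1;       // 0xfff
    int ImmedOffset = (Offset / Scale) & Mask; // 0x234 stays in the instruction
    Offset &= ~int(Mask * Scale);              // 0x1000 is left over
    assert(ImmedOffset == 0x234);
    assert(Offset == 0x1000);
    return 0;
  }
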
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index 84dcb49..f3688c0 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -27,66 +27,34 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo {
public:
explicit Thumb2InstrInfo(const ARMSubtarget &STI);
- /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
- /// such, whenever a client has an instance of instruction info, it should
- /// always be able to get register info as well (through this method).
- ///
- const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
-
- bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI) const;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI) const;
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const;
- bool isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
+ // Return true if the block does not fall through.
+ bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
bool copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SrcReg,
- const TargetRegisterClass *DestRC,
- const TargetRegisterClass *SrcRC) const;
- void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC) const;
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
- void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC) const;
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
- void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
- bool canFoldMemoryOperand(const MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops) const;
-
- MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex) const;
-
- MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- MachineInstr* LoadMI) const {
- return 0;
- }
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
};
}
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp
index 0f0c0e4..6c4c15d 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -13,12 +13,15 @@
#include "ARM.h"
#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "Thumb2InstrInfo.h"
#include "Thumb2RegisterInfo.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -30,14 +33,10 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
-static cl::opt<bool>
-Thumb2RegScavenging("enable-thumb2-reg-scavenging",
- cl::Hidden,
- cl::desc("Enable register scavenging on Thumb-2"));
-
-Thumb2RegisterInfo::Thumb2RegisterInfo(const TargetInstrInfo &tii,
+Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii,
const ARMSubtarget &sti)
: ARMBaseRegisterInfo(tii, sti) {
}
@@ -46,710 +45,23 @@ Thumb2RegisterInfo::Thumb2RegisterInfo(const TargetInstrInfo &tii,
/// specified immediate.
void Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, int Val,
- const TargetInstrInfo *TII,
- DebugLoc dl) const {
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx,
+ int Val,
+ ARMCC::CondCodes Pred,
+ unsigned PredReg) const {
MachineFunction &MF = *MBB.getParent();
MachineConstantPool *ConstantPool = MF.getConstantPool();
- Constant *C = ConstantInt::get(Type::Int32Ty, Val);
+ Constant *C = ConstantInt::get(
+ Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
- BuildMI(MBB, MBBI, dl, TII->get(ARM::tLDRcp), DestReg)
- .addConstantPoolIndex(Idx);
-}
-
-const TargetRegisterClass*
-Thumb2RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, MVT VT) const {
- if (isARMLowRegister(Reg))
- return ARM::tGPRRegisterClass;
- switch (Reg) {
- default:
- break;
- case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11:
- case ARM::R12: case ARM::SP: case ARM::LR: case ARM::PC:
- return ARM::GPRRegisterClass;
- }
-
- return TargetRegisterInfo::getPhysicalRegisterRegClass(Reg, VT);
-}
-
-bool
-Thumb2RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
- return Thumb2RegScavenging;
-}
-
-bool Thumb2RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
- const MachineFrameInfo *FFI = MF.getFrameInfo();
- unsigned CFSize = FFI->getMaxCallFrameSize();
- // It's not always a good idea to include the call frame as part of the
- // stack frame. ARM (especially Thumb) has only small immediate offsets
- // for addressing the stack frame. So a large call frame can cause poor
- // codegen and may even make it impossible to scavenge a register.
- if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4
- return false;
-
- return !MF.getFrameInfo()->hasVarSizedObjects();
-}
-
-/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
-/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
-/// in a register using mov / mvn sequences or load the immediate from a
-/// constpool entry.
-static
-void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, unsigned BaseReg,
- int NumBytes, bool CanChangeCC,
- const TargetInstrInfo &TII,
- const Thumb2RegisterInfo& MRI,
- DebugLoc dl) {
- bool isHigh = !isARMLowRegister(DestReg) ||
- (BaseReg != 0 && !isARMLowRegister(BaseReg));
- bool isSub = false;
- // Subtract doesn't have a high register version. Load the negative value
- // if either the base or dest register is a high register. Also, do not
- // issue sub as part of the sequence if the condition register is to be
- // preserved.
- if (NumBytes < 0 && !isHigh && CanChangeCC) {
- isSub = true;
- NumBytes = -NumBytes;
- }
- unsigned LdReg = DestReg;
- if (DestReg == ARM::SP) {
- assert(BaseReg == ARM::SP && "Unexpected!");
- LdReg = ARM::R3;
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R3, RegState::Kill);
- }
-
- if (NumBytes <= 255 && NumBytes >= 0)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
- else if (NumBytes < 0 && NumBytes >= -255) {
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), LdReg)
- .addReg(LdReg, RegState::Kill);
- } else
- MRI.emitLoadConstPool(MBB, MBBI, LdReg, NumBytes, &TII, dl);
-
- // Emit add / sub.
- int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
- const MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl,
- TII.get(Opc), DestReg);
- if (DestReg == ARM::SP || isSub)
- MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
- else
- MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
- if (DestReg == ARM::SP)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVhir2lor), ARM::R3)
- .addReg(ARM::R12, RegState::Kill);
-}
-
-/// calcNumMI - Returns the number of instructions required to materialize
-/// the specific add / sub r, c instruction.
-static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes,
- unsigned NumBits, unsigned Scale) {
- unsigned NumMIs = 0;
- unsigned Chunk = ((1 << NumBits) - 1) * Scale;
-
- if (Opc == ARM::tADDrSPi) {
- unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
- Bytes -= ThisVal;
- NumMIs++;
- NumBits = 8;
- Scale = 1; // Followed by a number of tADDi8.
- Chunk = ((1 << NumBits) - 1) * Scale;
- }
-
- NumMIs += Bytes / Chunk;
- if ((Bytes % Chunk) != 0)
- NumMIs++;
- if (ExtraOpc)
- NumMIs++;
- return NumMIs;
-}
-
-/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
-/// a destreg = basereg + immediate in Thumb code.
-static
-void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, unsigned BaseReg,
- int NumBytes, const TargetInstrInfo &TII,
- const Thumb2RegisterInfo& MRI,
- DebugLoc dl) {
- bool isSub = NumBytes < 0;
- unsigned Bytes = (unsigned)NumBytes;
- if (isSub) Bytes = -NumBytes;
- bool isMul4 = (Bytes & 3) == 0;
- bool isTwoAddr = false;
- bool DstNotEqBase = false;
- unsigned NumBits = 1;
- unsigned Scale = 1;
- int Opc = 0;
- int ExtraOpc = 0;
-
- if (DestReg == BaseReg && BaseReg == ARM::SP) {
- assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
- NumBits = 7;
- Scale = 4;
- Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
- isTwoAddr = true;
- } else if (!isSub && BaseReg == ARM::SP) {
- // r1 = add sp, 403
- // =>
- // r1 = add sp, 100 * 4
- // r1 = add r1, 3
- if (!isMul4) {
- Bytes &= ~3;
- ExtraOpc = ARM::tADDi3;
- }
- NumBits = 8;
- Scale = 4;
- Opc = ARM::tADDrSPi;
- } else {
- // sp = sub sp, c
- // r1 = sub sp, c
- // r8 = sub sp, c
- if (DestReg != BaseReg)
- DstNotEqBase = true;
- NumBits = 8;
- Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
- isTwoAddr = true;
- }
-
- unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale);
- unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
- if (NumMIs > Threshold) {
- // This will expand into too many instructions. Load the immediate from a
- // constpool entry.
- emitThumbRegPlusImmInReg(MBB, MBBI, DestReg, BaseReg, NumBytes, true, TII,
- MRI, dl);
- return;
- }
-
- if (DstNotEqBase) {
- if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) {
- // If both are low registers, emit DestReg = add BaseReg, min(Imm, 7)
- unsigned Chunk = (1 << 3) - 1;
- unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
- Bytes -= ThisVal;
- BuildMI(MBB, MBBI, dl,TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3), DestReg)
- .addReg(BaseReg, RegState::Kill).addImm(ThisVal);
- } else {
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
- .addReg(BaseReg, RegState::Kill);
- }
- BaseReg = DestReg;
- }
-
- unsigned Chunk = ((1 << NumBits) - 1) * Scale;
- while (Bytes) {
- unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
- Bytes -= ThisVal;
- ThisVal /= Scale;
- // Build the new tADD / tSUB.
- if (isTwoAddr)
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(DestReg).addImm(ThisVal);
- else {
- bool isKill = BaseReg != ARM::SP;
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
- BaseReg = DestReg;
-
- if (Opc == ARM::tADDrSPi) {
- // r4 = add sp, imm
- // r4 = add r4, imm
- // ...
- NumBits = 8;
- Scale = 1;
- Chunk = ((1 << NumBits) - 1) * Scale;
- Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
- isTwoAddr = true;
- }
- }
- }
-
- if (ExtraOpc)
- BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg)
- .addReg(DestReg, RegState::Kill)
- .addImm(((unsigned)NumBytes) & 3);
-}
-
-static void emitSPUpdate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo &TII, DebugLoc dl,
- const Thumb2RegisterInfo &MRI,
- int NumBytes) {
- emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII,
- MRI, dl);
-}
-
-void Thumb2RegisterInfo::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
- if (!hasReservedCallFrame(MF)) {
- // If we have alloca, convert as follows:
- // ADJCALLSTACKDOWN -> sub, sp, sp, amount
- // ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr *Old = I;
- DebugLoc dl = Old->getDebugLoc();
- unsigned Amount = Old->getOperand(0).getImm();
- if (Amount != 0) {
- // We need to keep the stack aligned properly. To do this, we round the
- // amount of space needed for the outgoing arguments up to the next
- // alignment boundary.
- unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
- Amount = (Amount+Align-1)/Align*Align;
-
- // Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old->getOpcode();
- if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
- emitSPUpdate(MBB, I, TII, dl, *this, -Amount);
- } else {
- assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
- emitSPUpdate(MBB, I, TII, dl, *this, Amount);
- }
- }
- }
- MBB.erase(I);
-}
-
-/// emitThumbConstant - Emit a series of instructions to materialize a
-/// constant.
-static void emitThumbConstant(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, int Imm,
- const TargetInstrInfo &TII,
- const Thumb2RegisterInfo& MRI,
- DebugLoc dl) {
- bool isSub = Imm < 0;
- if (isSub) Imm = -Imm;
-
- int Chunk = (1 << 8) - 1;
- int ThisVal = (Imm > Chunk) ? Chunk : Imm;
- Imm -= ThisVal;
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), DestReg).addImm(ThisVal);
- if (Imm > 0)
- emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl);
- if (isSub)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), DestReg)
- .addReg(DestReg, RegState::Kill);
-}
-
-void Thumb2RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const{
- unsigned i = 0;
- MachineInstr &MI = *II;
- MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction &MF = *MBB.getParent();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- DebugLoc dl = MI.getDebugLoc();
-
- while (!MI.getOperand(i).isFI()) {
- ++i;
- assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
- }
-
- unsigned FrameReg = ARM::SP;
- int FrameIndex = MI.getOperand(i).getIndex();
- int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
- MF.getFrameInfo()->getStackSize() + SPAdj;
-
- if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
- Offset -= AFI->getGPRCalleeSavedArea1Offset();
- else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
- Offset -= AFI->getGPRCalleeSavedArea2Offset();
- else if (hasFP(MF)) {
- assert(SPAdj == 0 && "Unexpected");
- // There are alloca()'s in this function; we must reference off the frame
- // pointer instead.
- FrameReg = getFrameRegister(MF);
- Offset -= AFI->getFramePtrSpillOffset();
- }
-
- unsigned Opcode = MI.getOpcode();
- const TargetInstrDesc &Desc = MI.getDesc();
- unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
-
- if (Opcode == ARM::tADDrSPi) {
- Offset += MI.getOperand(i+1).getImm();
-
- // Can't use tADDrSPi if it's based off the frame pointer.
- unsigned NumBits = 0;
- unsigned Scale = 1;
- if (FrameReg != ARM::SP) {
- Opcode = ARM::tADDi3;
- MI.setDesc(TII.get(ARM::tADDi3));
- NumBits = 3;
- } else {
- NumBits = 8;
- Scale = 4;
- assert((Offset & 3) == 0 &&
- "Thumb add/sub sp, #imm immediate must be multiple of 4!");
- }
-
- if (Offset == 0) {
- // Turn it into a move.
- MI.setDesc(TII.get(ARM::tMOVhir2lor));
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.RemoveOperand(i+1);
- return;
- }
-
- // Common case: small offset, fits into instruction.
- unsigned Mask = (1 << NumBits) - 1;
- if (((Offset / Scale) & ~Mask) == 0) {
- // Replace the FrameIndex with sp / fp
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(Offset / Scale);
- return;
- }
-
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned Bytes = (Offset > 0) ? Offset : -Offset;
- unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale);
- // MI would expand into a large number of instructions. Don't try to
- // simplify the immediate.
- if (NumMIs > 2) {
- emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII,
- *this, dl);
- MBB.erase(II);
- return;
- }
-
- if (Offset > 0) {
- // Translate r0 = add sp, imm to
- // r0 = add sp, 255*4
- // r0 = add r0, (imm - 255*4)
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(Mask);
- Offset = (Offset - Mask * Scale);
- MachineBasicBlock::iterator NII = next(II);
- emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII,
- *this, dl);
- } else {
- // Translate r0 = add sp, -imm to
- // r0 = -imm (this is then translated into a series of instructions)
- // r0 = add r0, sp
- emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
- MI.setDesc(TII.get(ARM::tADDhirr));
- MI.getOperand(i).ChangeToRegister(DestReg, false, false, true);
- MI.getOperand(i+1).ChangeToRegister(FrameReg, false);
- }
- return;
- } else {
- unsigned ImmIdx = 0;
- int InstrOffs = 0;
- unsigned NumBits = 0;
- unsigned Scale = 1;
- switch (AddrMode) {
- case ARMII::AddrModeT1_s: {
- ImmIdx = i+1;
- InstrOffs = MI.getOperand(ImmIdx).getImm();
- NumBits = (FrameReg == ARM::SP) ? 8 : 5;
- Scale = 4;
- break;
- }
- default:
- assert(0 && "Unsupported addressing mode!");
- abort();
- break;
- }
-
- Offset += InstrOffs * Scale;
- assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
-
- // Common case: small offset, fits into instruction.
- MachineOperand &ImmOp = MI.getOperand(ImmIdx);
- int ImmedOffset = Offset / Scale;
- unsigned Mask = (1 << NumBits) - 1;
- if ((unsigned)Offset <= Mask * Scale) {
- // Replace the FrameIndex with sp
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- ImmOp.ChangeToImmediate(ImmedOffset);
- return;
- }
-
- bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill;
- if (AddrMode == ARMII::AddrModeT1_s) {
- // Thumb tLDRspi, tSTRspi. These will change to instructions that use
- // a different base register.
- NumBits = 5;
- Mask = (1 << NumBits) - 1;
- }
- // If this is a thumb spill / restore, we will be using a constpool load to
- // materialize the offset.
- if (AddrMode == ARMII::AddrModeT1_s && isThumSpillRestore)
- ImmOp.ChangeToImmediate(0);
- else {
- // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
- ImmedOffset = ImmedOffset & Mask;
- ImmOp.ChangeToImmediate(ImmedOffset);
- Offset &= ~(Mask*Scale);
- }
- }
-
- // If we get here, the immediate doesn't fit into the instruction. We folded
- // as much as possible above, handle the rest, providing a register that is
- // SP+LargeImm.
- assert(Offset && "This code isn't needed if offset already handled!");
-
- if (Desc.mayLoad()) {
- // Use the destination register to materialize sp + offset.
- unsigned TmpReg = MI.getOperand(0).getReg();
- bool UseRR = false;
- if (Opcode == ARM::tRestore) {
- if (FrameReg == ARM::SP)
- emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
- Offset, false, TII, *this, dl);
- else {
- emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl);
- UseRR = true;
- }
- } else
- emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
- *this, dl);
- MI.setDesc(TII.get(ARM::tLDR));
- MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
- if (UseRR)
- // Use [reg, reg] addrmode.
- MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
- else // tLDR has an extra register operand.
- MI.addOperand(MachineOperand::CreateReg(0, false));
- } else if (Desc.mayStore()) {
- // FIXME! This is horrific!!! We need register scavenging.
- // Our temporary workaround has marked r3 unavailable. Of course, r3 is
- // also an ABI register, so it's possible that it is the register being
- // stored here. If that's the case, we do the following:
- // r12 = r2
- // Use r2 to materialize sp + offset
- // str r3, r2
- // r2 = r12
- unsigned ValReg = MI.getOperand(0).getReg();
- unsigned TmpReg = ARM::R3;
- bool UseRR = false;
- if (ValReg == ARM::R3) {
- BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R2, RegState::Kill);
- TmpReg = ARM::R2;
- }
- if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
- BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R3, RegState::Kill);
- if (Opcode == ARM::tSpill) {
- if (FrameReg == ARM::SP)
- emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
- Offset, false, TII, *this, dl);
- else {
- emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl);
- UseRR = true;
- }
- } else
- emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
- *this, dl);
- MI.setDesc(TII.get(ARM::tSTR));
- MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
- if (UseRR) // Use [reg, reg] addrmode.
- MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
- else // tSTR has an extra register operand.
- MI.addOperand(MachineOperand::CreateReg(0, false));
-
- MachineBasicBlock::iterator NII = next(II);
- if (ValReg == ARM::R3)
- BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R2)
- .addReg(ARM::R12, RegState::Kill);
- if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
- BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R3)
- .addReg(ARM::R12, RegState::Kill);
- } else
- assert(false && "Unexpected opcode!");
-}
-
-void Thumb2RegisterInfo::emitPrologue(MachineFunction &MF) const {
- MachineBasicBlock &MBB = MF.front();
- MachineBasicBlock::iterator MBBI = MBB.begin();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
- unsigned NumBytes = MFI->getStackSize();
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- DebugLoc dl = (MBBI != MBB.end() ?
- MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
-
- // Check if R3 is live in. It might have to be used as a scratch register.
- for (MachineRegisterInfo::livein_iterator I =MF.getRegInfo().livein_begin(),
- E = MF.getRegInfo().livein_end(); I != E; ++I) {
- if (I->first == ARM::R3) {
- AFI->setR3IsLiveIn(true);
- break;
- }
- }
-
- // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
- NumBytes = (NumBytes + 3) & ~3;
- MFI->setStackSize(NumBytes);
-
- // Determine the size of each callee-save spill area and record which
- // frame index belongs to which area.
- unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
- int FramePtrSpillFI = 0;
-
- if (VARegSaveSize)
- emitSPUpdate(MBB, MBBI, TII, dl, *this, -VARegSaveSize);
-
- if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes);
- return;
- }
-
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
- int FI = CSI[i].getFrameIdx();
- switch (Reg) {
- case ARM::R4:
- case ARM::R5:
- case ARM::R6:
- case ARM::R7:
- case ARM::LR:
- if (Reg == FramePtr)
- FramePtrSpillFI = FI;
- AFI->addGPRCalleeSavedArea1Frame(FI);
- GPRCS1Size += 4;
- break;
- case ARM::R8:
- case ARM::R9:
- case ARM::R10:
- case ARM::R11:
- if (Reg == FramePtr)
- FramePtrSpillFI = FI;
- if (STI.isTargetDarwin()) {
- AFI->addGPRCalleeSavedArea2Frame(FI);
- GPRCS2Size += 4;
- } else {
- AFI->addGPRCalleeSavedArea1Frame(FI);
- GPRCS1Size += 4;
- }
- break;
- default:
- AFI->addDPRCalleeSavedAreaFrame(FI);
- DPRCSSize += 8;
- }
- }
-
- if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
- ++MBBI;
- if (MBBI != MBB.end())
- dl = MBBI->getDebugLoc();
- }
-
- // Darwin ABI requires FP to point to the stack slot that contains the
- // previous FP.
- if (STI.isTargetDarwin() || hasFP(MF)) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
- .addFrameIndex(FramePtrSpillFI).addImm(0);
- }
-
- // Determine starting offsets of spill areas.
- unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
- unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
- unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
- AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
- AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
- AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
- AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
-
- NumBytes = DPRCSOffset;
- if (NumBytes) {
- // Insert it after all the callee-save spills.
- emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes);
- }
-
- if (STI.isTargetELF() && hasFP(MF)) {
- MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
- AFI->getFramePtrSpillOffset());
- }
-
- AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
- AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
- AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
+ .addReg(DestReg, getDefRegState(true), SubIdx)
+ .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0);
}
-static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
- for (unsigned i = 0; CSRegs[i]; ++i)
- if (Reg == CSRegs[i])
- return true;
- return false;
-}
-
-static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) {
- return (MI->getOpcode() == ARM::tRestore &&
- MI->getOperand(1).isFI() &&
- isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
-}
-
-void Thumb2RegisterInfo::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = prior(MBB.end());
- assert((MBBI->getOpcode() == ARM::tBX_RET ||
- MBBI->getOpcode() == ARM::tPOP_RET) &&
- "Can only insert epilog into returning blocks");
- DebugLoc dl = MBBI->getDebugLoc();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
- int NumBytes = (int)MFI->getStackSize();
-
- if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes);
- } else {
- // Unwind MBBI to point to first LDR / FLDD.
- const unsigned *CSRegs = getCalleeSavedRegs();
- if (MBBI != MBB.begin()) {
- do
- --MBBI;
- while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
- if (!isCSRestore(MBBI, CSRegs))
- ++MBBI;
- }
-
- // Move SP to start of FP callee save spill area.
- NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
- AFI->getGPRCalleeSavedArea2Size() +
- AFI->getDPRCalleeSavedAreaSize());
-
- if (hasFP(MF)) {
- NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
- // Reset SP based on frame pointer only if the stack frame extends beyond
- // frame pointer stack slot or target is ELF and the function has FP.
- if (NumBytes)
- emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes,
- TII, *this, dl);
- else
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::SP)
- .addReg(FramePtr);
- } else {
- if (MBBI->getOpcode() == ARM::tBX_RET &&
- &MBB.front() != MBBI &&
- prior(MBBI)->getOpcode() == ARM::tPOP) {
- MachineBasicBlock::iterator PMBBI = prior(MBBI);
- emitSPUpdate(MBB, PMBBI, TII, dl, *this, NumBytes);
- } else
- emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes);
- }
- }
-
- if (VARegSaveSize) {
- // Epilogue for vararg functions: pop LR to R3 and branch off it.
- // FIXME: Verify this is still ok when R3 is no longer being reserved.
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)).addReg(ARM::R3);
-
- emitSPUpdate(MBB, MBBI, TII, dl, *this, VARegSaveSize);
-
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)).addReg(ARM::R3);
- MBB.erase(MBBI);
- }
+bool Thumb2RegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+ return true;
}
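
For orientation, a hedged sketch of a call site for the reworked emitLoadConstPool hook above; the register, value and insertion point are arbitrary examples, not taken from the tree.

  // Materialize 0x12345678 into R4 with an unconditional (default
  // ARMCC::AL) Thumb-2 constant-pool load (expands to t2LDRpci).
  static void materializeExample(const Thumb2RegisterInfo &TRI,
                                 MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator &MBBI,
                                 DebugLoc dl) {
    TRI.emitLoadConstPool(MBB, MBBI, dl, ARM::R4, /*SubIdx=*/0,
                          /*Val=*/0x12345678);
  }
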
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h
index d379c31..a63c60b 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.h
+++ b/lib/Target/ARM/Thumb2RegisterInfo.h
@@ -20,40 +20,23 @@
namespace llvm {
class ARMSubtarget;
- class TargetInstrInfo;
+ class ARMBaseInstrInfo;
class Type;
struct Thumb2RegisterInfo : public ARMBaseRegisterInfo {
public:
- Thumb2RegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
+ Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
void emitLoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, int Val,
- const TargetInstrInfo *TII,
- DebugLoc dl) const;
-
- /// Code Generation virtual methods...
- const TargetRegisterClass *
- getPhysicalRegisterRegClass(unsigned Reg, MVT VT = MVT::Other) const;
-
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0) const;
bool requiresRegisterScavenging(const MachineFunction &MF) const;
-
- bool hasReservedCallFrame(MachineFunction &MF) const;
-
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
-
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
-
- void emitPrologue(MachineFunction &MF) const;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
};
}
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
new file mode 100644
index 0000000..b8879d2
--- /dev/null
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -0,0 +1,685 @@
+//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "t2-reduce-size"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMBaseInstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones");
+STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones");
+STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones");
+
+static cl::opt<int> ReduceLimit("t2-reduce-limit",
+ cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2",
+ cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3",
+ cl::init(-1), cl::Hidden);
+
+namespace {
+ /// ReduceTable - A static table with information on mapping from wide
+ /// opcodes to narrow ones.
+ struct ReduceEntry {
+ unsigned WideOpc; // Wide opcode
+ unsigned NarrowOpc1; // Narrow opcode to transform to
+ unsigned NarrowOpc2; // Narrow opcode when it's two-address
+ uint8_t Imm1Limit; // Limit of immediate field (bits)
+ uint8_t Imm2Limit; // Limit of immediate field when it's two-address
+ unsigned LowRegs1 : 1; // Only possible if low-registers are used
+ unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr)
+ unsigned PredCC1 : 2; // 0 - If predicated, cc is on and vice versa.
+ // 1 - No cc field.
+ // 2 - Always set CPSR.
+ unsigned PredCC2 : 2;
+ unsigned Special : 1; // Needs to be dealt with specially
+ };
+
+ static const ReduceEntry ReduceTable[] = {
+ // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, S
+ { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0 },
+ { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0 },
+ // Note: immediate scale is 4.
+ { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 0 },
+ { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 1 },
+ { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 1 },
+ { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 0 },
+ { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0 },
+ { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0 },
+ { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0 },
+ { ARM::t2CMPzri,ARM::tCMPzi8, 0, 8, 0, 1, 0, 2,0, 0 },
+ { ARM::t2CMPzrr,ARM::tCMPzhir,0, 0, 0, 0, 0, 2,0, 0 },
+ { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 0 },
+ // FIXME: adr.n immediate offset must be multiple of 4.
+ //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 0 },
+ { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 0 },
+ { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 },
+ { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 },
+ // FIXME: Do we need the 16-bit 'S' variant?
+ { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0 },
+ { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0 },
+ { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 0, 0,1, 0 },
+ { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0 },
+ { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 1 },
+ { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0 },
+ { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0 },
+ { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0 },
+ { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0 },
+ { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0 },
+ { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0 },
+
+ // FIXME: Clean this up after splitting each Thumb load / store opcode
+ // into multiple ones.
+ { ARM::t2LDRi12,ARM::tLDR, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRs, ARM::tLDR, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRBi12,ARM::tLDRB, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRBs, ARM::tLDRB, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRHi12,ARM::tLDRH, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRHs, ARM::tLDRH, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRi12,ARM::tSTR, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRs, ARM::tSTR, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRBi12,ARM::tSTRB, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRBs, ARM::tSTRB, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRHi12,ARM::tSTRH, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRHs, ARM::tSTRH, 0, 0, 0, 1, 0, 0,0, 1 },
+
+ { ARM::t2LDM_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 1 },
+ { ARM::t2LDM, ARM::tLDM, ARM::tPOP, 0, 0, 1, 1, 1,1, 1 },
+ { ARM::t2STM, ARM::tSTM, ARM::tPUSH, 0, 0, 1, 1, 1,1, 1 },
+ };
+
+ class VISIBILITY_HIDDEN Thumb2SizeReduce : public MachineFunctionPass {
+ public:
+ static char ID;
+ Thumb2SizeReduce();
+
+ const Thumb2InstrInfo *TII;
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "Thumb2 instruction size reduction pass";
+ }
+
+ private:
+ /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
+ DenseMap<unsigned, unsigned> ReduceOpcodeMap;
+
+ bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+ bool is2Addr, ARMCC::CondCodes Pred,
+ bool LiveCPSR, bool &HasCC, bool &CCDead);
+
+ bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry);
+
+ bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry, bool LiveCPSR);
+
+ /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
+ /// instruction.
+ bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR);
+
+ /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
+ /// non-two-address instruction.
+ bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR);
+
+ /// ReduceMBB - Reduce width of instructions in the specified basic block.
+ bool ReduceMBB(MachineBasicBlock &MBB);
+ };
+ char Thumb2SizeReduce::ID = 0;
+}
+
+Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(&ID) {
+ for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
+ unsigned FromOpc = ReduceTable[i].WideOpc;
+ if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second)
+ assert(false && "Duplicated entries?");
+ }
+}
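+
+A minimal sketch of the O(1) lookup the constructor above enables; lookupReduceEntry is a hypothetical helper assuming the ReduceTable and map declared earlier.
+
+  // Map a wide Thumb2 opcode to its ReduceTable row; null when the opcode
+  // has no 16-bit counterpart.
+  static const ReduceEntry *lookupReduceEntry(
+      const DenseMap<unsigned, unsigned> &Map, unsigned WideOpc) {
+    DenseMap<unsigned, unsigned>::const_iterator I = Map.find(WideOpc);
+    return I == Map.end() ? 0 : &ReduceTable[I->second];
+  }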
+
+static bool HasImplicitCPSRDef(const TargetInstrDesc &TID) {
+ for (const unsigned *Regs = TID.ImplicitDefs; *Regs; ++Regs)
+ if (*Regs == ARM::CPSR)
+ return true;
+ return false;
+}
+
+bool
+Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+ bool is2Addr, ARMCC::CondCodes Pred,
+ bool LiveCPSR, bool &HasCC, bool &CCDead) {
+ if ((is2Addr && Entry.PredCC2 == 0) ||
+ (!is2Addr && Entry.PredCC1 == 0)) {
+ if (Pred == ARMCC::AL) {
+ // Not predicated, must set CPSR.
+ if (!HasCC) {
+ // Original instruction was not setting CPSR, but CPSR is not
+ // currently live anyway. It's ok to set it. The CPSR def is
+ // dead though.
+ if (!LiveCPSR) {
+ HasCC = true;
+ CCDead = true;
+ return true;
+ }
+ return false;
+ }
+ } else {
+ // Predicated, must not set CPSR.
+ if (HasCC)
+ return false;
+ }
+ } else if ((is2Addr && Entry.PredCC2 == 2) ||
+ (!is2Addr && Entry.PredCC1 == 2)) {
+ // Old opcode has an optional def of CPSR.
+ if (HasCC)
+ return true;
+ // If the old opcode does not implicitly define CPSR, then it's not ok,
+ // since the new opcode's CPSR def is not meant to be thrown away. e.g. CMP.
+ if (!HasImplicitCPSRDef(MI->getDesc()))
+ return false;
+ HasCC = true;
+ } else {
+ // 16-bit instruction does not set CPSR.
+ if (HasCC)
+ return false;
+ }
+
+ return true;
+}
+
+static bool VerifyLowRegs(MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+ bool isPCOk = (Opc == ARM::t2LDM_RET) || (Opc == ARM::t2LDM);
+ bool isLROk = (Opc == ARM::t2STM);
+ bool isSPOk = isPCOk || isLROk || (Opc == ARM::t2ADDrSPi);
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || MO.isImplicit())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0 || Reg == ARM::CPSR)
+ continue;
+ if (isPCOk && Reg == ARM::PC)
+ continue;
+ if (isLROk && Reg == ARM::LR)
+ continue;
+ if (isSPOk && Reg == ARM::SP)
+ continue;
+ if (!isARMLowRegister(Reg))
+ return false;
+ }
+ return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry) {
+ if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt))
+ return false;
+
+ unsigned Scale = 1;
+ bool HasImmOffset = false;
+ bool HasShift = false;
+ bool isLdStMul = false;
+ unsigned Opc = Entry.NarrowOpc1;
+ unsigned OpNum = 3; // First 'rest' of operands.
+ switch (Entry.WideOpc) {
+ default:
+ llvm_unreachable("Unexpected Thumb2 load / store opcode!");
+ case ARM::t2LDRi12:
+ case ARM::t2STRi12:
+ Scale = 4;
+ HasImmOffset = true;
+ break;
+ case ARM::t2LDRBi12:
+ case ARM::t2STRBi12:
+ HasImmOffset = true;
+ break;
+ case ARM::t2LDRHi12:
+ case ARM::t2STRHi12:
+ Scale = 2;
+ HasImmOffset = true;
+ break;
+ case ARM::t2LDRs:
+ case ARM::t2LDRBs:
+ case ARM::t2LDRHs:
+ case ARM::t2LDRSBs:
+ case ARM::t2LDRSHs:
+ case ARM::t2STRs:
+ case ARM::t2STRBs:
+ case ARM::t2STRHs:
+ HasShift = true;
+ OpNum = 4;
+ break;
+ case ARM::t2LDM_RET:
+ case ARM::t2LDM:
+ case ARM::t2STM: {
+ OpNum = 0;
+ unsigned BaseReg = MI->getOperand(0).getReg();
+ unsigned Mode = MI->getOperand(1).getImm();
+ if (BaseReg == ARM::SP && ARM_AM::getAM4WBFlag(Mode)) {
+ Opc = Entry.NarrowOpc2;
+ OpNum = 2;
+ } else if (Entry.WideOpc == ARM::t2LDM_RET ||
+ !isARMLowRegister(BaseReg) ||
+ !ARM_AM::getAM4WBFlag(Mode) ||
+ ARM_AM::getAM4SubMode(Mode) != ARM_AM::ia) {
+ return false;
+ }
+ isLdStMul = true;
+ break;
+ }
+ }
+
+ unsigned OffsetReg = 0;
+ bool OffsetKill = false;
+ if (HasShift) {
+ OffsetReg = MI->getOperand(2).getReg();
+ OffsetKill = MI->getOperand(2).isKill();
+ if (MI->getOperand(3).getImm())
+ // Thumb1 addressing mode doesn't support shift.
+ return false;
+ }
+
+ unsigned OffsetImm = 0;
+ if (HasImmOffset) {
+ OffsetImm = MI->getOperand(2).getImm();
+ unsigned MaxOffset = ((1 << Entry.Imm1Limit) - 1) * Scale;
+ if ((OffsetImm & (Scale-1)) || OffsetImm > MaxOffset)
+ // Make sure the immediate field fits.
+ return false;
+ }
+
+ // Add the 16-bit load / store instruction.
+ // FIXME: Thumb1 addressing mode encodes both immediate and register offset.
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, TII->get(Opc));
+ if (!isLdStMul) {
+ MIB.addOperand(MI->getOperand(0)).addOperand(MI->getOperand(1));
+ if (Entry.NarrowOpc1 != ARM::tLDRSB && Entry.NarrowOpc1 != ARM::tLDRSH) {
+ // tLDRSB and tLDRSH do not have an immediate offset field. On the other
+ // hand, they must have an offset register.
+ // FIXME: Remove this special case.
+ MIB.addImm(OffsetImm/Scale);
+ }
+ assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");
+
+ MIB.addReg(OffsetReg, getKillRegState(OffsetKill));
+ }
+
+ // Transfer the rest of the operands.
+ for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
+ MIB.addOperand(MI->getOperand(OpNum));
+
+ DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+ MBB.erase(MI);
+ ++NumLdSts;
+ return true;
+}
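+
+A worked instance of the immediate-fit check above, for the t2LDRi12 -> tLDR row (Imm1Limit = 5, Scale = 4): byte offsets 0..124 in multiples of 4 can narrow; anything larger or misaligned keeps the 32-bit encoding. Illustration only.
+
+  #include <cassert>
+
+  int main() {
+    unsigned Imm1Limit = 5, Scale = 4;                    // t2LDRi12 -> tLDR
+    unsigned MaxOffset = ((1u << Imm1Limit) - 1) * Scale; // 124 bytes
+    unsigned Ok = 64, TooBig = 128, Misaligned = 6;
+    assert((Ok & (Scale - 1)) == 0 && Ok <= MaxOffset);   // reducible
+    assert(TooBig > MaxOffset);                           // stays 32-bit
+    assert((Misaligned & (Scale - 1)) != 0);              // stays 32-bit
+    return 0;
+  }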
+
+bool
+Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR) {
+ if (Entry.LowRegs1 && !VerifyLowRegs(MI))
+ return false;
+
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (TID.mayLoad() || TID.mayStore())
+ return ReduceLoadStore(MBB, MI, Entry);
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default: break;
+ case ARM::t2ADDSri:
+ case ARM::t2ADDSrr: {
+ unsigned PredReg = 0;
+ if (getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+ switch (Opc) {
+ default: break;
+ case ARM::t2ADDSri: {
+ if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR))
+ return true;
+ // fallthrough
+ }
+ case ARM::t2ADDSrr:
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+ }
+ }
+ break;
+ }
+ case ARM::t2RSBri:
+ case ARM::t2RSBSri:
+ if (MI->getOperand(2).getImm() == 0)
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+ break;
+ }
+ return false;
+}
+
+bool
+Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR) {
+ if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
+ return false;
+
+ const TargetInstrDesc &TID = MI->getDesc();
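+  // The 16-bit two-address encoding requires the destination and the first
+  // source register to be identical.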
+ unsigned Reg0 = MI->getOperand(0).getReg();
+ unsigned Reg1 = MI->getOperand(1).getReg();
+ if (Reg0 != Reg1)
+ return false;
+ if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
+ return false;
+ if (Entry.Imm2Limit) {
+ unsigned Imm = MI->getOperand(2).getImm();
+ unsigned Limit = (1 << Entry.Imm2Limit) - 1;
+ if (Imm > Limit)
+ return false;
+ } else {
+ unsigned Reg2 = MI->getOperand(2).getReg();
+ if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
+ return false;
+ }
+
+ // Check if it's possible / necessary to transfer the predicate.
+ const TargetInstrDesc &NewTID = TII->get(Entry.NarrowOpc2);
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ bool SkipPred = false;
+ if (Pred != ARMCC::AL) {
+ if (!NewTID.isPredicable())
+ // Can't transfer predicate, fail.
+ return false;
+ } else {
+ SkipPred = !NewTID.isPredicable();
+ }
+
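+  // Determine whether MI has an optional CPSR def and whether that def is
+  // dead.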
+ bool HasCC = false;
+ bool CCDead = false;
+ if (TID.hasOptionalDef()) {
+ unsigned NumOps = TID.getNumOperands();
+ HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+ if (HasCC && MI->getOperand(NumOps-1).isDead())
+ CCDead = true;
+ }
+ if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
+ return false;
+
+ // Add the 16-bit instruction.
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
+ MIB.addOperand(MI->getOperand(0));
+ if (NewTID.hasOptionalDef()) {
+ if (HasCC)
+ AddDefaultT1CC(MIB, CCDead);
+ else
+ AddNoT1CC(MIB);
+ }
+
+  // Transfer the rest of the operands.
+ unsigned NumOps = TID.getNumOperands();
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+ if (i < NumOps && TID.OpInfo[i].isOptionalDef())
+ continue;
+ if (SkipPred && TID.OpInfo[i].isPredicate())
+ continue;
+ MIB.addOperand(MI->getOperand(i));
+ }
+
+ DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+ MBB.erase(MI);
+ ++Num2Addrs;
+ return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR) {
+ if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
+ return false;
+
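+  // Compute the maximum immediate the narrow encoding can hold; t2ADDrSPi
+  // immediates are scaled by 4.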
+ unsigned Limit = ~0U;
+ unsigned Scale = (Entry.WideOpc == ARM::t2ADDrSPi) ? 4 : 1;
+ if (Entry.Imm1Limit)
+ Limit = ((1 << Entry.Imm1Limit) - 1) * Scale;
+
+ const TargetInstrDesc &TID = MI->getDesc();
+ for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) {
+ if (TID.OpInfo[i].isPredicate())
+ continue;
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (!Reg || Reg == ARM::CPSR)
+ continue;
+ if (Entry.WideOpc == ARM::t2ADDrSPi && Reg == ARM::SP)
+ continue;
+ if (Entry.LowRegs1 && !isARMLowRegister(Reg))
+ return false;
+ } else if (MO.isImm() &&
+ !TID.OpInfo[i].isPredicate()) {
+ if (((unsigned)MO.getImm()) > Limit || (MO.getImm() & (Scale-1)) != 0)
+ return false;
+ }
+ }
+
+ // Check if it's possible / necessary to transfer the predicate.
+ const TargetInstrDesc &NewTID = TII->get(Entry.NarrowOpc1);
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ bool SkipPred = false;
+ if (Pred != ARMCC::AL) {
+ if (!NewTID.isPredicable())
+ // Can't transfer predicate, fail.
+ return false;
+ } else {
+ SkipPred = !NewTID.isPredicable();
+ }
+
+ bool HasCC = false;
+ bool CCDead = false;
+ if (TID.hasOptionalDef()) {
+ unsigned NumOps = TID.getNumOperands();
+ HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+ if (HasCC && MI->getOperand(NumOps-1).isDead())
+ CCDead = true;
+ }
+ if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
+ return false;
+
+ // Add the 16-bit instruction.
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
+ MIB.addOperand(MI->getOperand(0));
+ if (NewTID.hasOptionalDef()) {
+ if (HasCC)
+ AddDefaultT1CC(MIB, CCDead);
+ else
+ AddNoT1CC(MIB);
+ }
+
+  // Transfer the rest of the operands.
+ unsigned NumOps = TID.getNumOperands();
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+ if (i < NumOps && TID.OpInfo[i].isOptionalDef())
+ continue;
+ if ((TID.getOpcode() == ARM::t2RSBSri ||
+ TID.getOpcode() == ARM::t2RSBri) && i == 2)
+      // Skip the zero immediate operand; it's implicit in the narrow form.
+ continue;
+ bool isPred = (i < NumOps && TID.OpInfo[i].isPredicate());
+ if (SkipPred && isPred)
+ continue;
+ const MachineOperand &MO = MI->getOperand(i);
+ if (Scale > 1 && !isPred && MO.isImm())
+ MIB.addImm(MO.getImm() / Scale);
+ else {
+ if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR)
+ // Skip implicit def of CPSR. Either it's modeled as an optional
+ // def now or it's already an implicit def on the new instruction.
+ continue;
+ MIB.addOperand(MO);
+ }
+ }
+ if (!TID.isPredicable() && NewTID.isPredicable())
+ AddDefaultPred(MIB);
+
+ DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+ MBB.erase(MI);
+ ++NumNarrows;
+ return true;
+}
+
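+/// UpdateCPSRDef - Return the CPSR liveness after MI executes: CPSR is live
+/// if MI defines it without the def being dead, or if it was live already.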
+static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) {
+ bool HasDef = false;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.isUndef() || MO.isUse())
+ continue;
+ if (MO.getReg() != ARM::CPSR)
+ continue;
+ if (!MO.isDead())
+ HasDef = true;
+ }
+
+ return HasDef || LiveCPSR;
+}
+
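+/// UpdateCPSRUse - Update CPSR liveness across MI's uses; a use of CPSR
+/// marked as a kill ends its live range.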
+static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.isUndef() || MO.isDef())
+ continue;
+ if (MO.getReg() != ARM::CPSR)
+ continue;
+ assert(LiveCPSR && "CPSR liveness tracking is wrong!");
+ if (MO.isKill()) {
+ LiveCPSR = false;
+ break;
+ }
+ }
+
+ return LiveCPSR;
+}
+
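+/// ReduceMBB - Try to replace each 32-bit Thumb2 instruction in MBB with an
+/// equivalent 16-bit one, tracking CPSR liveness along the way.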
+bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ bool LiveCPSR = false;
+  // CPSR may be live-in to the block.
+ for (MachineBasicBlock::const_livein_iterator I = MBB.livein_begin(),
+ E = MBB.livein_end(); I != E; ++I) {
+ if (*I == ARM::CPSR) {
+ LiveCPSR = true;
+ break;
+ }
+ }
+
+ MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
+ MachineBasicBlock::iterator NextMII;
+ for (; MII != E; MII = NextMII) {
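+    // Grab the next iterator first; a successful reduction erases MI and
+    // inserts its 16-bit replacement before NextMII.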
+ NextMII = next(MII);
+
+ MachineInstr *MI = &*MII;
+ LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
+
+ unsigned Opcode = MI->getOpcode();
+ DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode);
+ if (OPI != ReduceOpcodeMap.end()) {
+ const ReduceEntry &Entry = ReduceTable[OPI->second];
+      // Entries marked "special" need extra checking; hand them off to
+      // ReduceSpecial.
+ if (Entry.Special) {
+ if (ReduceSpecial(MBB, MI, Entry, LiveCPSR)) {
+ Modified = true;
+ MachineBasicBlock::iterator I = prior(NextMII);
+ MI = &*I;
+ }
+ goto ProcessNext;
+ }
+
+ // Try to transform to a 16-bit two-address instruction.
+ if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) {
+ Modified = true;
+ MachineBasicBlock::iterator I = prior(NextMII);
+ MI = &*I;
+ goto ProcessNext;
+ }
+
+      // Try to transform to a 16-bit non-two-address instruction.
+ if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) {
+ Modified = true;
+ MachineBasicBlock::iterator I = prior(NextMII);
+ MI = &*I;
+ }
+ }
+
+ ProcessNext:
+ LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR);
+ }
+
+ return Modified;
+}
+
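+/// runOnMachineFunction - Entry point: attempt the reduction on every basic
+/// block in the function and report whether anything changed.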
+bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
+ const TargetMachine &TM = MF.getTarget();
+ TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+
+ bool Modified = false;
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+ Modified |= ReduceMBB(*I);
+ return Modified;
+}
+
+/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size
+/// reduction pass.
+FunctionPass *llvm::createThumb2SizeReductionPass() {
+ return new Thumb2SizeReduce();
+}