Diffstat (limited to 'contrib/llvm/lib/Target/ARM')
86 files changed, 55875 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h new file mode 100644 index 0000000..ae7ae59 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARM.h @@ -0,0 +1,125 @@ +//===-- ARM.h - Top-level interface for ARM representation---- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// ARM back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_ARM_H +#define TARGET_ARM_H + +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> + +namespace llvm { + +class ARMBaseTargetMachine; +class FunctionPass; +class JITCodeEmitter; +class formatted_raw_ostream; + +// Enums corresponding to ARM condition codes +namespace ARMCC { + // The CondCodes constants map directly to the 4-bit encoding of the + // condition field for predicated instructions. + enum CondCodes { + EQ, + NE, + HS, + LO, + MI, + PL, + VS, + VC, + HI, + LS, + GE, + LT, + GT, + LE, + AL + }; + + inline static CondCodes getOppositeCondition(CondCodes CC) { + switch (CC) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return NE; + case NE: return EQ; + case HS: return LO; + case LO: return HS; + case MI: return PL; + case PL: return MI; + case VS: return VC; + case VC: return VS; + case HI: return LS; + case LS: return HI; + case GE: return LT; + case LT: return GE; + case GT: return LE; + case LE: return GT; + } + } +} // namespace ARMCC + +inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { + switch (CC) { + default: llvm_unreachable("Unknown condition code"); + case ARMCC::EQ: return "eq"; + case ARMCC::NE: return "ne"; + case ARMCC::HS: return "hs"; + case ARMCC::LO: return "lo"; + case ARMCC::MI: return "mi"; + case ARMCC::PL: return "pl"; + case ARMCC::VS: return "vs"; + case ARMCC::VC: return "vc"; + case ARMCC::HI: return "hi"; + case ARMCC::LS: return "ls"; + case ARMCC::GE: return "ge"; + case ARMCC::LT: return "lt"; + case ARMCC::GT: return "gt"; + case ARMCC::LE: return "le"; + case ARMCC::AL: return "al"; + } +} + +/// ModelWithRegSequence - Return true if isel should use REG_SEQUENCE to model +/// operations involving sub-registers. +bool ModelWithRegSequence(); + +FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, + CodeGenOpt::Level OptLevel); + +FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM, + JITCodeEmitter &JCE); + +FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); +FunctionPass *createARMExpandPseudoPass(); +FunctionPass *createARMConstantIslandPass(); +FunctionPass *createNEONPreAllocPass(); +FunctionPass *createNEONMoveFixPass(); +FunctionPass *createThumb2ITBlockPass(); +FunctionPass *createThumb2SizeReductionPass(); + +extern Target TheARMTarget, TheThumbTarget; + +} // end namespace llvm; + +// Defines symbolic names for ARM registers. This defines a mapping from +// register name to register number. +// +#include "ARMGenRegisterNames.inc" + +// Defines symbolic names for the ARM instructions. 
+// +#include "ARMGenInstrNames.inc" + + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td new file mode 100644 index 0000000..f1e6a9f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARM.td @@ -0,0 +1,161 @@ +//===- ARM.td - Describe the ARM Target Machine -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// ARM Subtarget features. +// + +def ArchV4T : SubtargetFeature<"v4t", "ARMArchVersion", "V4T", + "ARM v4T">; +def ArchV5T : SubtargetFeature<"v5t", "ARMArchVersion", "V5T", + "ARM v5T">; +def ArchV5TE : SubtargetFeature<"v5te", "ARMArchVersion", "V5TE", + "ARM v5TE, v5TEj, v5TExp">; +def ArchV6 : SubtargetFeature<"v6", "ARMArchVersion", "V6", + "ARM v6">; +def ArchV6T2 : SubtargetFeature<"v6t2", "ARMArchVersion", "V6T2", + "ARM v6t2">; +def ArchV7A : SubtargetFeature<"v7a", "ARMArchVersion", "V7A", + "ARM v7A">; +def ArchV7M : SubtargetFeature<"v7m", "ARMArchVersion", "V7M", + "ARM v7M">; +def FeatureVFP2 : SubtargetFeature<"vfp2", "ARMFPUType", "VFPv2", + "Enable VFP2 instructions">; +def FeatureVFP3 : SubtargetFeature<"vfp3", "ARMFPUType", "VFPv3", + "Enable VFP3 instructions">; +def FeatureNEON : SubtargetFeature<"neon", "ARMFPUType", "NEON", + "Enable NEON instructions">; +def FeatureThumb2 : SubtargetFeature<"thumb2", "ThumbMode", "Thumb2", + "Enable Thumb2 instructions">; +def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", + "Enable half-precision floating point">; +def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", + "Enable divide instructions">; +def FeatureT2ExtractPack: SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true", + "Enable Thumb2 extract and pack instructions">; + +// Some processors have multiply-accumulate instructions that don't +// play nicely with other VFP instructions, and it's generally better +// to just not use them. +// FIXME: Currently, this is only flagged for Cortex-A8. It may be true for +// others as well. We should do more benchmarking and confirm one way or +// the other. +def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true", + "Disable VFP MAC instructions">; +// Some processors benefit from using NEON instructions for scalar +// single-precision FP operations. +def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; + + +//===----------------------------------------------------------------------===// +// ARM Processors supported. +// + +include "ARMSchedule.td" + +class ProcNoItin<string Name, list<SubtargetFeature> Features> + : Processor<Name, GenericItineraries, Features>; + +// V4 Processors. 
+def : ProcNoItin<"generic", []>; +def : ProcNoItin<"arm8", []>; +def : ProcNoItin<"arm810", []>; +def : ProcNoItin<"strongarm", []>; +def : ProcNoItin<"strongarm110", []>; +def : ProcNoItin<"strongarm1100", []>; +def : ProcNoItin<"strongarm1110", []>; + +// V4T Processors. +def : ProcNoItin<"arm7tdmi", [ArchV4T]>; +def : ProcNoItin<"arm7tdmi-s", [ArchV4T]>; +def : ProcNoItin<"arm710t", [ArchV4T]>; +def : ProcNoItin<"arm720t", [ArchV4T]>; +def : ProcNoItin<"arm9", [ArchV4T]>; +def : ProcNoItin<"arm9tdmi", [ArchV4T]>; +def : ProcNoItin<"arm920", [ArchV4T]>; +def : ProcNoItin<"arm920t", [ArchV4T]>; +def : ProcNoItin<"arm922t", [ArchV4T]>; +def : ProcNoItin<"arm940t", [ArchV4T]>; +def : ProcNoItin<"ep9312", [ArchV4T]>; + +// V5T Processors. +def : ProcNoItin<"arm10tdmi", [ArchV5T]>; +def : ProcNoItin<"arm1020t", [ArchV5T]>; + +// V5TE Processors. +def : ProcNoItin<"arm9e", [ArchV5TE]>; +def : ProcNoItin<"arm926ej-s", [ArchV5TE]>; +def : ProcNoItin<"arm946e-s", [ArchV5TE]>; +def : ProcNoItin<"arm966e-s", [ArchV5TE]>; +def : ProcNoItin<"arm968e-s", [ArchV5TE]>; +def : ProcNoItin<"arm10e", [ArchV5TE]>; +def : ProcNoItin<"arm1020e", [ArchV5TE]>; +def : ProcNoItin<"arm1022e", [ArchV5TE]>; +def : ProcNoItin<"xscale", [ArchV5TE]>; +def : ProcNoItin<"iwmmxt", [ArchV5TE]>; + +// V6 Processors. +def : Processor<"arm1136j-s", ARMV6Itineraries, [ArchV6]>; +def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2, + FeatureHasSlowVMLx]>; +def : Processor<"arm1176jz-s", ARMV6Itineraries, [ArchV6]>; +def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; +def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>; +def : Processor<"mpcore", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; + +// V6T2 Processors. +def : Processor<"arm1156t2-s", ARMV6Itineraries, + [ArchV6T2, FeatureThumb2]>; +def : Processor<"arm1156t2f-s", ARMV6Itineraries, + [ArchV6T2, FeatureThumb2, FeatureVFP2]>; + +// V7 Processors. 
+def : Processor<"cortex-a8", CortexA8Itineraries, + [ArchV7A, FeatureThumb2, FeatureNEON, FeatureHasSlowVMLx, + FeatureNEONForFP, FeatureT2ExtractPack]>; +def : Processor<"cortex-a9", CortexA9Itineraries, + [ArchV7A, FeatureThumb2, FeatureNEON, FeatureT2ExtractPack]>; +def : ProcNoItin<"cortex-m3", [ArchV7M, FeatureThumb2, FeatureHWDiv]>; +def : ProcNoItin<"cortex-m4", [ArchV7M, FeatureThumb2, FeatureHWDiv]>; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "ARMRegisterInfo.td" + +include "ARMCallingConv.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "ARMInstrInfo.td" + +def ARMInstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// + +def ARM : Target { + // Pull in Instruction Info: + let InstructionSet = ARMInstrInfo; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h new file mode 100644 index 0000000..e68354a --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h @@ -0,0 +1,528 @@ +//===- ARMAddressingModes.h - ARM Addressing Modes --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM addressing mode implementation stuff. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H +#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H + +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> + +namespace llvm { + +/// ARM_AM - ARM Addressing Mode Stuff +namespace ARM_AM { + enum ShiftOpc { + no_shift = 0, + asr, + lsl, + lsr, + ror, + rrx + }; + + enum AddrOpc { + add = '+', sub = '-' + }; + + static inline const char *getAddrOpcStr(AddrOpc Op) { + return Op == sub ? "-" : ""; + } + + static inline const char *getShiftOpcStr(ShiftOpc Op) { + switch (Op) { + default: assert(0 && "Unknown shift opc!"); + case ARM_AM::asr: return "asr"; + case ARM_AM::lsl: return "lsl"; + case ARM_AM::lsr: return "lsr"; + case ARM_AM::ror: return "ror"; + case ARM_AM::rrx: return "rrx"; + } + } + + static inline ShiftOpc getShiftOpcForNode(SDValue N) { + switch (N.getOpcode()) { + default: return ARM_AM::no_shift; + case ISD::SHL: return ARM_AM::lsl; + case ISD::SRL: return ARM_AM::lsr; + case ISD::SRA: return ARM_AM::asr; + case ISD::ROTR: return ARM_AM::ror; + //case ISD::ROTL: // Only if imm -> turn into ROTR. + // Can't handle RRX here, because it would require folding a flag into + // the addressing mode. :( This causes us to miss certain things. 
+ //case ARMISD::RRX: return ARM_AM::rrx; + } + } + + enum AMSubMode { + bad_am_submode = 0, + ia, + ib, + da, + db + }; + + static inline const char *getAMSubModeStr(AMSubMode Mode) { + switch (Mode) { + default: assert(0 && "Unknown addressing sub-mode!"); + case ARM_AM::ia: return "ia"; + case ARM_AM::ib: return "ib"; + case ARM_AM::da: return "da"; + case ARM_AM::db: return "db"; + } + } + + /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits. + /// + static inline unsigned rotr32(unsigned Val, unsigned Amt) { + assert(Amt < 32 && "Invalid rotate amount"); + return (Val >> Amt) | (Val << ((32-Amt)&31)); + } + + /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits. + /// + static inline unsigned rotl32(unsigned Val, unsigned Amt) { + assert(Amt < 32 && "Invalid rotate amount"); + return (Val << Amt) | (Val >> ((32-Amt)&31)); + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #1: shift_operand with registers + //===--------------------------------------------------------------------===// + // + // This 'addressing mode' is used for arithmetic instructions. It can + // represent things like: + // reg + // reg [asr|lsl|lsr|ror|rrx] reg + // reg [asr|lsl|lsr|ror|rrx] imm + // + // This is stored three operands [rega, regb, opc]. The first is the base + // reg, the second is the shift amount (or reg0 if not present or imm). The + // third operand encodes the shift opcode and the imm if a reg isn't present. + // + static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) { + return ShOp | (Imm << 3); + } + static inline unsigned getSORegOffset(unsigned Op) { + return Op >> 3; + } + static inline ShiftOpc getSORegShOp(unsigned Op) { + return (ShiftOpc)(Op & 7); + } + + /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return + /// the 8-bit imm value. + static inline unsigned getSOImmValImm(unsigned Imm) { + return Imm & 0xFF; + } + /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return + /// the rotate amount. + static inline unsigned getSOImmValRot(unsigned Imm) { + return (Imm >> 8) * 2; + } + + /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand, + /// computing the rotate amount to use. If this immediate value cannot be + /// handled with a single shifter-op, determine a good rotate amount that will + /// take a maximal chunk of bits out of the immediate. + static inline unsigned getSOImmValRotate(unsigned Imm) { + // 8-bit (or less) immediates are trivially shifter_operands with a rotate + // of zero. + if ((Imm & ~255U) == 0) return 0; + + // Use CTZ to compute the rotate amount. + unsigned TZ = CountTrailingZeros_32(Imm); + + // Rotate amount must be even. Something like 0x200 must be rotated 8 bits, + // not 9. + unsigned RotAmt = TZ & ~1; + + // If we can handle this spread, return it. + if ((rotr32(Imm, RotAmt) & ~255U) == 0) + return (32-RotAmt)&31; // HW rotates right, not left. + + // For values like 0xF000000F, we should ignore the low 6 bits, then + // retry the hunt. + if (Imm & 63U) { + unsigned TZ2 = CountTrailingZeros_32(Imm & ~63U); + unsigned RotAmt2 = TZ2 & ~1; + if ((rotr32(Imm, RotAmt2) & ~255U) == 0) + return (32-RotAmt2)&31; // HW rotates right, not left. + } + + // Otherwise, we have no way to cover this span of bits with a single + // shifter_op immediate. Return a chunk of bits that will be useful to + // handle. + return (32-RotAmt)&31; // HW rotates right, not left. 
+ } + + /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit + /// into an shifter_operand immediate operand, return the 12-bit encoding for + /// it. If not, return -1. + static inline int getSOImmVal(unsigned Arg) { + // 8-bit (or less) immediates are trivially shifter_operands with a rotate + // of zero. + if ((Arg & ~255U) == 0) return Arg; + + unsigned RotAmt = getSOImmValRotate(Arg); + + // If this cannot be handled with a single shifter_op, bail out. + if (rotr32(~255U, RotAmt) & Arg) + return -1; + + // Encode this correctly. + return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8); + } + + /// isSOImmTwoPartVal - Return true if the specified value can be obtained by + /// or'ing together two SOImmVal's. + static inline bool isSOImmTwoPartVal(unsigned V) { + // If this can be handled with a single shifter_op, bail out. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + if (V == 0) + return false; + + // If this can be handled with two shifter_op's, accept. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + return V == 0; + } + + /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal, + /// return the first chunk of it. + static inline unsigned getSOImmTwoPartFirst(unsigned V) { + return rotr32(255U, getSOImmValRotate(V)) & V; + } + + /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal, + /// return the second chunk of it. + static inline unsigned getSOImmTwoPartSecond(unsigned V) { + // Mask out the first hunk. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + + // Take what's left. + assert(V == (rotr32(255U, getSOImmValRotate(V)) & V)); + return V; + } + + /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed + /// by a left shift. Returns the shift amount to use. + static inline unsigned getThumbImmValShift(unsigned Imm) { + // 8-bit (or less) immediates are trivially immediate operand with a shift + // of zero. + if ((Imm & ~255U) == 0) return 0; + + // Use CTZ to compute the shift amount. + return CountTrailingZeros_32(Imm); + } + + /// isThumbImmShiftedVal - Return true if the specified value can be obtained + /// by left shifting a 8-bit immediate. + static inline bool isThumbImmShiftedVal(unsigned V) { + // If this can be handled with + V = (~255U << getThumbImmValShift(V)) & V; + return V == 0; + } + + /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed + /// by a left shift. Returns the shift amount to use. + static inline unsigned getThumbImm16ValShift(unsigned Imm) { + // 16-bit (or less) immediates are trivially immediate operand with a shift + // of zero. + if ((Imm & ~65535U) == 0) return 0; + + // Use CTZ to compute the shift amount. + return CountTrailingZeros_32(Imm); + } + + /// isThumbImm16ShiftedVal - Return true if the specified value can be + /// obtained by left shifting a 16-bit immediate. + static inline bool isThumbImm16ShiftedVal(unsigned V) { + // If this can be handled with + V = (~65535U << getThumbImm16ValShift(V)) & V; + return V == 0; + } + + /// getThumbImmNonShiftedVal - If V is a value that satisfies + /// isThumbImmShiftedVal, return the non-shiftd value. + static inline unsigned getThumbImmNonShiftedVal(unsigned V) { + return V >> getThumbImmValShift(V); + } + + + /// getT2SOImmValSplat - Return the 12-bit encoded representation + /// if the specified value can be obtained by splatting the low 8 bits + /// into every other byte or every byte of a 32-bit value. 
i.e., + /// 00000000 00000000 00000000 abcdefgh control = 0 + /// 00000000 abcdefgh 00000000 abcdefgh control = 1 + /// abcdefgh 00000000 abcdefgh 00000000 control = 2 + /// abcdefgh abcdefgh abcdefgh abcdefgh control = 3 + /// Return -1 if none of the above apply. + /// See ARM Reference Manual A6.3.2. + static inline int getT2SOImmValSplatVal(unsigned V) { + unsigned u, Vs, Imm; + // control = 0 + if ((V & 0xffffff00) == 0) + return V; + + // If the value is zeroes in the first byte, just shift those off + Vs = ((V & 0xff) == 0) ? V >> 8 : V; + // Any passing value only has 8 bits of payload, splatted across the word + Imm = Vs & 0xff; + // Likewise, any passing values have the payload splatted into the 3rd byte + u = Imm | (Imm << 16); + + // control = 1 or 2 + if (Vs == u) + return (((Vs == V) ? 1 : 2) << 8) | Imm; + + // control = 3 + if (Vs == (u | (u << 8))) + return (3 << 8) | Imm; + + return -1; + } + + /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the + /// specified value is a rotated 8-bit value. Return -1 if no rotation + /// encoding is possible. + /// See ARM Reference Manual A6.3.2. + static inline int getT2SOImmValRotateVal(unsigned V) { + unsigned RotAmt = CountLeadingZeros_32(V); + if (RotAmt >= 24) + return -1; + + // If 'Arg' can be handled with a single shifter_op return the value. + if ((rotr32(0xff000000U, RotAmt) & V) == V) + return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7); + + return -1; + } + + /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit + /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit + /// encoding for it. If not, return -1. + /// See ARM Reference Manual A6.3.2. + static inline int getT2SOImmVal(unsigned Arg) { + // If 'Arg' is an 8-bit splat, then get the encoded value. + int Splat = getT2SOImmValSplatVal(Arg); + if (Splat != -1) + return Splat; + + // If 'Arg' can be handled with a single shifter_op return the value. + int Rot = getT2SOImmValRotateVal(Arg); + if (Rot != -1) + return Rot; + + return -1; + } + + static inline unsigned getT2SOImmValRotate(unsigned V) { + if ((V & ~255U) == 0) return 0; + // Use CTZ to compute the rotate amount. + unsigned RotAmt = CountTrailingZeros_32(V); + return (32 - RotAmt) & 31; + } + + static inline bool isT2SOImmTwoPartVal (unsigned Imm) { + unsigned V = Imm; + // Passing values can be any combination of splat values and shifter + // values. If this can be handled with a single shifter or splat, bail + // out. Those should be handled directly, not with a two-part val. + if (getT2SOImmValSplatVal(V) != -1) + return false; + V = rotr32 (~255U, getT2SOImmValRotate(V)) & V; + if (V == 0) + return false; + + // If this can be handled as an immediate, accept. + if (getT2SOImmVal(V) != -1) return true; + + // Likewise, try masking out a splat value first. + V = Imm; + if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1) + V &= ~0xff00ff00U; + else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1) + V &= ~0x00ff00ffU; + // If what's left can be handled as an immediate, accept. + if (getT2SOImmVal(V) != -1) return true; + + // Otherwise, do not accept. + return false; + } + + static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) { + assert (isT2SOImmTwoPartVal(Imm) && + "Immedate cannot be encoded as two part immediate!"); + // Try a shifter operand as one part + unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm; + // If the rest is encodable as an immediate, then return it. 
+ if (getT2SOImmVal(V) != -1) return V; + + // Try masking out a splat value first. + if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1) + return Imm & 0xff00ff00U; + + // The other splat is all that's left as an option. + assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1); + return Imm & 0x00ff00ffU; + } + + static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) { + // Mask out the first hunk + Imm ^= getT2SOImmTwoPartFirst(Imm); + // Return what's left + assert (getT2SOImmVal(Imm) != -1 && + "Unable to encode second part of T2 two part SO immediate"); + return Imm; + } + + + //===--------------------------------------------------------------------===// + // Addressing Mode #2 + //===--------------------------------------------------------------------===// + // + // This is used for most simple load/store instructions. + // + // addrmode2 := reg +/- reg shop imm + // addrmode2 := reg +/- imm12 + // + // The first operand is always a Reg. The second operand is a reg if in + // reg/reg form, otherwise it's reg#0. The third field encodes the operation + // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. + // + // If this addressing mode is a frame index (before prolog/epilog insertion + // and code rewriting), this operand will have the form: FI#, reg0, <offs> + // with no shift amount for the frame offset. + // + static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) { + assert(Imm12 < (1 << 12) && "Imm too large!"); + bool isSub = Opc == sub; + return Imm12 | ((int)isSub << 12) | (SO << 13); + } + static inline unsigned getAM2Offset(unsigned AM2Opc) { + return AM2Opc & ((1 << 12)-1); + } + static inline AddrOpc getAM2Op(unsigned AM2Opc) { + return ((AM2Opc >> 12) & 1) ? sub : add; + } + static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { + return (ShiftOpc)(AM2Opc >> 13); + } + + + //===--------------------------------------------------------------------===// + // Addressing Mode #3 + //===--------------------------------------------------------------------===// + // + // This is used for sign-extending loads, and load/store-pair instructions. + // + // addrmode3 := reg +/- reg + // addrmode3 := reg +/- imm8 + // + // The first operand is always a Reg. The second operand is a reg if in + // reg/reg form, otherwise it's reg#0. The third field encodes the operation + // in bit 8, the immediate in bits 0-7. + + /// getAM3Opc - This function encodes the addrmode3 opc field. + static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) { + bool isSub = Opc == sub; + return ((int)isSub << 8) | Offset; + } + static inline unsigned char getAM3Offset(unsigned AM3Opc) { + return AM3Opc & 0xFF; + } + static inline AddrOpc getAM3Op(unsigned AM3Opc) { + return ((AM3Opc >> 8) & 1) ? sub : add; + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #4 + //===--------------------------------------------------------------------===// + // + // This is used for load / store multiple instructions. 
+ // + // addrmode4 := reg, <mode> + // + // The four modes are: + // IA - Increment after + // IB - Increment before + // DA - Decrement after + // DB - Decrement before + + static inline AMSubMode getAM4SubMode(unsigned Mode) { + return (AMSubMode)(Mode & 0x7); + } + + static inline unsigned getAM4ModeImm(AMSubMode SubMode) { + return (int)SubMode; + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #5 + //===--------------------------------------------------------------------===// + // + // This is used for coprocessor instructions, such as FP load/stores. + // + // addrmode5 := reg +/- imm8*4 + // + // The first operand is always a Reg. The second operand encodes the + // operation in bit 8 and the immediate in bits 0-7. + // + // This is also used for FP load/store multiple ops. The second operand + // encodes the number of registers (or 2 times the number of registers + // for DPR ops) in bits 0-7. In addition, bits 8-10 encode one of the + // following two sub-modes: + // + // IA - Increment after + // DB - Decrement before + + /// getAM5Opc - This function encodes the addrmode5 opc field. + static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) { + bool isSub = Opc == sub; + return ((int)isSub << 8) | Offset; + } + static inline unsigned char getAM5Offset(unsigned AM5Opc) { + return AM5Opc & 0xFF; + } + static inline AddrOpc getAM5Op(unsigned AM5Opc) { + return ((AM5Opc >> 8) & 1) ? sub : add; + } + + /// getAM5Opc - This function encodes the addrmode5 opc field for VLDM and + /// VSTM instructions. + static inline unsigned getAM5Opc(AMSubMode SubMode, unsigned char Offset) { + assert((SubMode == ia || SubMode == db) && + "Illegal addressing mode 5 sub-mode!"); + return ((int)SubMode << 8) | Offset; + } + static inline AMSubMode getAM5SubMode(unsigned AM5Opc) { + return (AMSubMode)((AM5Opc >> 8) & 0x7); + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #6 + //===--------------------------------------------------------------------===// + // + // This is used for NEON load / store instructions. + // + // addrmode6 := reg with optional alignment + // + // This is stored in two operands [regaddr, align]. The first is the + // address register. The second operand is the value of the alignment + // specifier to use or zero if no explicit alignment. + +} // end namespace ARM_AM +} // end namespace llvm + +#endif + diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp new file mode 100644 index 0000000..2528854 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -0,0 +1,1472 @@ +//===- ARMBaseInstrInfo.cpp - ARM Instruction Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Base ARM implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// + +#include "ARMBaseInstrInfo.h" +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMConstantPoolValue.h" +#include "ARMGenInstrInfo.inc" +#include "ARMMachineFunctionInfo.h" +#include "ARMRegisterInfo.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/GlobalValue.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +using namespace llvm; + +static cl::opt<bool> +EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden, + cl::desc("Enable ARM 2-addr to 3-addr conv")); + +ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) + : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)), + Subtarget(STI) { +} + +MachineInstr * +ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + // FIXME: Thumb2 support. + + if (!EnableARM3Addr) + return NULL; + + MachineInstr *MI = MBBI; + MachineFunction &MF = *MI->getParent()->getParent(); + unsigned TSFlags = MI->getDesc().TSFlags; + bool isPre = false; + switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) { + default: return NULL; + case ARMII::IndexModePre: + isPre = true; + break; + case ARMII::IndexModePost: + break; + } + + // Try splitting an indexed load/store to an un-indexed one plus an add/sub + // operation. + unsigned MemOpc = getUnindexedOpcode(MI->getOpcode()); + if (MemOpc == 0) + return NULL; + + MachineInstr *UpdateMI = NULL; + MachineInstr *MemMI = NULL; + unsigned AddrMode = (TSFlags & ARMII::AddrModeMask); + const TargetInstrDesc &TID = MI->getDesc(); + unsigned NumOps = TID.getNumOperands(); + bool isLoad = !TID.mayStore(); + const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0); + const MachineOperand &Base = MI->getOperand(2); + const MachineOperand &Offset = MI->getOperand(NumOps-3); + unsigned WBReg = WB.getReg(); + unsigned BaseReg = Base.getReg(); + unsigned OffReg = Offset.getReg(); + unsigned OffImm = MI->getOperand(NumOps-2).getImm(); + ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm(); + switch (AddrMode) { + default: + assert(false && "Unknown indexed op!"); + return NULL; + case ARMII::AddrMode2: { + bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub; + unsigned Amt = ARM_AM::getAM2Offset(OffImm); + if (OffReg == 0) { + if (ARM_AM::getSOImmVal(Amt) == -1) + // Can't encode it in a so_imm operand. This transformation will + // add more than 1 instruction. Abandon! + return NULL; + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) + .addReg(BaseReg).addImm(Amt) + .addImm(Pred).addReg(0).addReg(0); + } else if (Amt != 0) { + ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm); + unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt); + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? 
ARM::SUBrs : ARM::ADDrs), WBReg) + .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc) + .addImm(Pred).addReg(0).addReg(0); + } else + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) + .addReg(BaseReg).addReg(OffReg) + .addImm(Pred).addReg(0).addReg(0); + break; + } + case ARMII::AddrMode3 : { + bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub; + unsigned Amt = ARM_AM::getAM3Offset(OffImm); + if (OffReg == 0) + // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand. + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) + .addReg(BaseReg).addImm(Amt) + .addImm(Pred).addReg(0).addReg(0); + else + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) + .addReg(BaseReg).addReg(OffReg) + .addImm(Pred).addReg(0).addReg(0); + break; + } + } + + std::vector<MachineInstr*> NewMIs; + if (isPre) { + if (isLoad) + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc), MI->getOperand(0).getReg()) + .addReg(WBReg).addReg(0).addImm(0).addImm(Pred); + else + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc)).addReg(MI->getOperand(1).getReg()) + .addReg(WBReg).addReg(0).addImm(0).addImm(Pred); + NewMIs.push_back(MemMI); + NewMIs.push_back(UpdateMI); + } else { + if (isLoad) + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc), MI->getOperand(0).getReg()) + .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred); + else + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc)).addReg(MI->getOperand(1).getReg()) + .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred); + if (WB.isDead()) + UpdateMI->getOperand(0).setIsDead(); + NewMIs.push_back(UpdateMI); + NewMIs.push_back(MemMI); + } + + // Transfer LiveVariables states, kill / dead info. + if (LV) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.getReg() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + unsigned Reg = MO.getReg(); + + LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); + if (MO.isDef()) { + MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI; + if (MO.isDead()) + LV->addVirtualRegisterDead(Reg, NewMI); + } + if (MO.isUse() && MO.isKill()) { + for (unsigned j = 0; j < 2; ++j) { + // Look at the two new MI's in reverse order. + MachineInstr *NewMI = NewMIs[j]; + if (!NewMI->readsRegister(Reg)) + continue; + LV->addVirtualRegisterKilled(Reg, NewMI); + if (VI.removeKill(MI)) + VI.Kills.push_back(NewMI); + break; + } + } + } + } + } + + MFI->insert(MBBI, NewMIs[1]); + MFI->insert(MBBI, NewMIs[0]); + return NewMIs[0]; +} + +bool +ARMBaseInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL; + if (MI != MBB.end()) DL = MI->getDebugLoc(); + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + bool isKill = true; + + // Add the callee-saved register as live-in unless it's LR and + // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress + // then it's already added to the function and entry block live-in sets. + if (Reg == ARM::LR) { + MachineFunction &MF = *MBB.getParent(); + if (MF.getFrameInfo()->isReturnAddressTaken() && + MF.getRegInfo().isLiveIn(Reg)) + isKill = false; + } + + if (isKill) + MBB.addLiveIn(Reg); + + // Insert the spill to the stack frame. 
The register is killed at the spill + // + storeRegToStackSlot(MBB, MI, Reg, isKill, + CSI[i].getFrameIdx(), CSI[i].getRegClass(), TRI); + } + return true; +} + +// Branch analysis. +bool +ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranchOpcode(LastOpc)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(LastInst->getOperand(1)); + Cond.push_back(LastInst->getOperand(2)); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with a B and a Bcc, handle it. + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + Cond.push_back(SecondLastInst->getOperand(1)); + Cond.push_back(SecondLastInst->getOperand(2)); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with two unconditional branches, handle it. The second + // one is not executed, so remove it. + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return false; + } + + // ...likewise if it ends with a branch table followed by an unconditional + // branch. The branch folder can create these, and we must get rid of them for + // correctness of Thumb constant islands. + if ((isJumpTableBranchOpcode(SecondLastOpc) || + isIndirectBranchOpcode(SecondLastOpc)) && + isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return true; + } + + // Otherwise, can't handle this. + return true; +} + + +unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) return 0; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } + if (!isUncondBranchOpcode(I->getOpcode()) && + !isCondBranchOpcode(I->getOpcode())) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (!isCondBranchOpcode(I->getOpcode())) + return 1; + + // Remove the branch. 
+ I->eraseFromParent(); + return 2; +} + +unsigned +ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond) const { + // FIXME this should probably have a DebugLoc argument + DebugLoc dl; + + ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>(); + int BOpc = !AFI->isThumbFunction() + ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB); + int BccOpc = !AFI->isThumbFunction() + ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc); + + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 2 || Cond.size() == 0) && + "ARM branch conditions have two components!"); + + if (FBB == 0) { + if (Cond.empty()) // Unconditional branch? + BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB); + else + BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); + return 1; + } + + // Two-way conditional branch. + BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); + BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB); + return 2; +} + +bool ARMBaseInstrInfo:: +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm(); + Cond[0].setImm(ARMCC::getOppositeCondition(CC)); + return false; +} + +bool ARMBaseInstrInfo:: +PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const { + unsigned Opc = MI->getOpcode(); + if (isUncondBranchOpcode(Opc)) { + MI->setDesc(get(getMatchingCondBranchOpcode(Opc))); + MI->addOperand(MachineOperand::CreateImm(Pred[0].getImm())); + MI->addOperand(MachineOperand::CreateReg(Pred[1].getReg(), false)); + return true; + } + + int PIdx = MI->findFirstPredOperandIdx(); + if (PIdx != -1) { + MachineOperand &PMO = MI->getOperand(PIdx); + PMO.setImm(Pred[0].getImm()); + MI->getOperand(PIdx+1).setReg(Pred[1].getReg()); + return true; + } + return false; +} + +bool ARMBaseInstrInfo:: +SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const { + if (Pred1.size() > 2 || Pred2.size() > 2) + return false; + + ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm(); + ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm(); + if (CC1 == CC2) + return true; + + switch (CC1) { + default: + return false; + case ARMCC::AL: + return true; + case ARMCC::HS: + return CC2 == ARMCC::HI; + case ARMCC::LS: + return CC2 == ARMCC::LO || CC2 == ARMCC::EQ; + case ARMCC::GE: + return CC2 == ARMCC::GT; + case ARMCC::LE: + return CC2 == ARMCC::LT; + } +} + +bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + // FIXME: This confuses implicit_def with optional CPSR def. + const TargetInstrDesc &TID = MI->getDesc(); + if (!TID.getImplicitDefs() && !TID.hasOptionalDef()) + return false; + + bool Found = false; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.getReg() == ARM::CPSR) { + Pred.push_back(MO); + Found = true; + } + } + + return Found; +} + +/// isPredicable - Return true if the specified instruction can be predicated. +/// By default, this returns true for every instruction with a +/// PredicateOperand. 
+bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const { + const TargetInstrDesc &TID = MI->getDesc(); + if (!TID.isPredicable()) + return false; + + if ((TID.TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) { + ARMFunctionInfo *AFI = + MI->getParent()->getParent()->getInfo<ARMFunctionInfo>(); + return AFI->isThumb2Function(); + } + return true; +} + +/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing. +DISABLE_INLINE +static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, + unsigned JTI); +static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, + unsigned JTI) { + assert(JTI < JT.size()); + return JT[JTI].MBBs.size(); +} + +/// GetInstSize - Return the size of the specified MachineInstr. +/// +unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + const MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + + // Basic size info comes from the TSFlags field. + const TargetInstrDesc &TID = MI->getDesc(); + unsigned TSFlags = TID.TSFlags; + + unsigned Opc = MI->getOpcode(); + switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) { + default: { + // If this machine instr is an inline asm, measure it. + if (MI->getOpcode() == ARM::INLINEASM) + return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + if (MI->isLabel()) + return 0; + switch (Opc) { + default: + llvm_unreachable("Unknown or unset size field for instr!"); + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + case TargetOpcode::DBG_LABEL: + case TargetOpcode::EH_LABEL: + case TargetOpcode::DBG_VALUE: + return 0; + } + break; + } + case ARMII::Size8Bytes: return 8; // ARM instruction x 2. + case ARMII::Size4Bytes: return 4; // ARM / Thumb2 instruction. + case ARMII::Size2Bytes: return 2; // Thumb1 instruction. + case ARMII::SizeSpecial: { + switch (Opc) { + case ARM::CONSTPOOL_ENTRY: + // If this machine instr is a constant pool entry, its size is recorded as + // operand #2. + return MI->getOperand(2).getImm(); + case ARM::Int_eh_sjlj_longjmp: + return 16; + case ARM::tInt_eh_sjlj_longjmp: + return 10; + case ARM::Int_eh_sjlj_setjmp: + case ARM::Int_eh_sjlj_setjmp_nofp: + return 24; + case ARM::tInt_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp_nofp: + return 14; + case ARM::BR_JTr: + case ARM::BR_JTm: + case ARM::BR_JTadd: + case ARM::tBR_JTr: + case ARM::t2BR_JT: + case ARM::t2TBB: + case ARM::t2TBH: { + // These are jumptable branches, i.e. a branch followed by an inlined + // jumptable. The size is 4 + 4 * number of entries. For TBB, each + // entry is one byte; TBH two byte each. + unsigned EntrySize = (Opc == ARM::t2TBB) + ? 1 : ((Opc == ARM::t2TBH) ? 2 : 4); + unsigned NumOps = TID.getNumOperands(); + MachineOperand JTOP = + MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2)); + unsigned JTI = JTOP.getIndex(); + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + assert(MJTI != 0); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + assert(JTI < JT.size()); + // Thumb instructions are 2 byte aligned, but JT entries are 4 byte + // 4 aligned. The assembler / linker may add 2 byte padding just before + // the JT entries. The size does not include this padding; the + // constant islands pass does separate bookkeeping for it. + // FIXME: If we know the size of the function is less than (1 << 16) *2 + // bytes, we can use 16-bit entries instead. 
Then there won't be an + // alignment issue. + unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4; + unsigned NumEntries = getNumJTEntries(JT, JTI); + if (Opc == ARM::t2TBB && (NumEntries & 1)) + // Make sure the instruction that follows TBB is 2-byte aligned. + // FIXME: Constant island pass should insert an "ALIGN" instruction + // instead. + ++NumEntries; + return NumEntries * EntrySize + InstSize; + } + default: + // Otherwise, pseudo-instruction sizes are zero. + return 0; + } + } + } + return 0; // Not reached +} + +/// Return true if the instruction is a register to register move and +/// leave the source and dest operands in the passed parameters. +/// +bool +ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned& SrcSubIdx, unsigned& DstSubIdx) const { + switch (MI.getOpcode()) { + default: break; + case ARM::VMOVS: + case ARM::VMOVD: + case ARM::VMOVDneon: + case ARM::VMOVQ: + case ARM::VMOVQQ : { + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SrcSubIdx = MI.getOperand(1).getSubReg(); + DstSubIdx = MI.getOperand(0).getSubReg(); + return true; + } + case ARM::MOVr: + case ARM::tMOVr: + case ARM::tMOVgpr2tgpr: + case ARM::tMOVtgpr2gpr: + case ARM::tMOVgpr2gpr: + case ARM::t2MOVr: { + assert(MI.getDesc().getNumOperands() >= 2 && + MI.getOperand(0).isReg() && + MI.getOperand(1).isReg() && + "Invalid ARM MOV instruction"); + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SrcSubIdx = MI.getOperand(1).getSubReg(); + DstSubIdx = MI.getOperand(0).getSubReg(); + return true; + } + } + + return false; +} + +unsigned +ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case ARM::LDR: + case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame. + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isReg() && + MI->getOperand(3).isImm() && + MI->getOperand(2).getReg() == 0 && + MI->getOperand(3).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::t2LDRi12: + case ARM::tRestore: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::VLDRD: + case ARM::VLDRS: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + +unsigned +ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case ARM::STR: + case ARM::t2STRs: // FIXME: don't use t2STRs to access frame. 
+ if (MI->getOperand(1).isFI() && + MI->getOperand(2).isReg() && + MI->getOperand(3).isImm() && + MI->getOperand(2).getReg() == 0 && + MI->getOperand(3).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::t2STRi12: + case ARM::tSpill: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::VSTRD: + case ARM::VSTRS: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + +bool +ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC, + DebugLoc DL) const { + // tGPR is used sometimes in ARM instructions that need to avoid using + // certain registers. Just treat it as GPR here. + if (DestRC == ARM::tGPRRegisterClass) + DestRC = ARM::GPRRegisterClass; + if (SrcRC == ARM::tGPRRegisterClass) + SrcRC = ARM::GPRRegisterClass; + + // Allow DPR / DPR_VFP2 / DPR_8 cross-class copies. + if (DestRC == ARM::DPR_8RegisterClass) + DestRC = ARM::DPR_VFP2RegisterClass; + if (SrcRC == ARM::DPR_8RegisterClass) + SrcRC = ARM::DPR_VFP2RegisterClass; + + // Allow QPR / QPR_VFP2 / QPR_8 cross-class copies. + if (DestRC == ARM::QPR_VFP2RegisterClass || + DestRC == ARM::QPR_8RegisterClass) + DestRC = ARM::QPRRegisterClass; + if (SrcRC == ARM::QPR_VFP2RegisterClass || + SrcRC == ARM::QPR_8RegisterClass) + SrcRC = ARM::QPRRegisterClass; + + // Allow QQPR / QQPR_VFP2 cross-class copies. + if (DestRC == ARM::QQPR_VFP2RegisterClass) + DestRC = ARM::QQPRRegisterClass; + if (SrcRC == ARM::QQPR_VFP2RegisterClass) + SrcRC = ARM::QQPRRegisterClass; + + // Disallow copies of unequal sizes. + if (DestRC != SrcRC && DestRC->getSize() != SrcRC->getSize()) + return false; + + if (DestRC == ARM::GPRRegisterClass) { + if (SrcRC == ARM::SPRRegisterClass) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VMOVRS), DestReg) + .addReg(SrcReg)); + else + AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), + DestReg).addReg(SrcReg))); + } else { + unsigned Opc; + + if (DestRC == ARM::SPRRegisterClass) + Opc = (SrcRC == ARM::GPRRegisterClass ? ARM::VMOVSR : ARM::VMOVS); + else if (DestRC == ARM::DPRRegisterClass) + Opc = ARM::VMOVD; + else if (DestRC == ARM::DPR_VFP2RegisterClass || + SrcRC == ARM::DPR_VFP2RegisterClass) + // Always use neon reg-reg move if source or dest is NEON-only regclass. 
+ Opc = ARM::VMOVDneon; + else if (DestRC == ARM::QPRRegisterClass) + Opc = ARM::VMOVQ; + else if (DestRC == ARM::QQPRRegisterClass) + Opc = ARM::VMOVQQ; + else if (DestRC == ARM::QQQQPRRegisterClass) + Opc = ARM::VMOVQQQQ; + else + return false; + + AddDefaultPred(BuildMI(MBB, I, DL, get(Opc), DestReg).addReg(SrcReg)); + } + + return true; +} + +static const +MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, + unsigned Reg, unsigned SubIdx, unsigned State, + const TargetRegisterInfo *TRI) { + if (!SubIdx) + return MIB.addReg(Reg, State); + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); + return MIB.addReg(Reg, State, SubIdx); +} + +void ARMBaseInstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), + MachineMemOperand::MOStore, 0, + MFI.getObjectSize(FI), + Align); + + // tGPR is used sometimes in ARM instructions that need to avoid using + // certain registers. Just treat it as GPR here. + if (RC == ARM::tGPRRegisterClass) + RC = ARM::GPRRegisterClass; + + if (RC == ARM::GPRRegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); + } else if (RC == ARM::SPRRegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else if (RC == ARM::DPRRegisterClass || + RC == ARM::DPR_VFP2RegisterClass || + RC == ARM::DPR_8RegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else if (RC == ARM::QPRRegisterClass || + RC == ARM::QPR_VFP2RegisterClass || + RC == ARM::QPR_8RegisterClass) { + // FIXME: Neon instructions should support predicates + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q)) + .addFrameIndex(FI).addImm(128) + .addReg(SrcReg, getKillRegState(isKill)) + .addMemOperand(MMO)); + } else { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQ)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addMemOperand(MMO)); + } + } else if (RC == ARM::QQPRRegisterClass || RC == ARM::QQPR_VFP2RegisterClass){ + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + // FIXME: It's possible to only store part of the QQ register if the + // spilled def has a sub-register index. 
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VST2q32)) + .addFrameIndex(FI).addImm(128); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); + AddDefaultPred(MIB.addMemOperand(MMO)); + } else { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD)) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + .addMemOperand(MMO); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); + } + } else { + assert(RC == ARM::QQQQPRRegisterClass && "Unknown regclass!"); + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD)) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + .addMemOperand(MMO); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI); + AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI); + } +} + +void ARMBaseInstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), + MachineMemOperand::MOLoad, 0, + MFI.getObjectSize(FI), + Align); + + // tGPR is used sometimes in ARM instructions that need to avoid using + // certain registers. Just treat it as GPR here. 
+ if (RC == ARM::tGPRRegisterClass) + RC = ARM::GPRRegisterClass; + + if (RC == ARM::GPRRegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg) + .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); + } else if (RC == ARM::SPRRegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else if (RC == ARM::DPRRegisterClass || + RC == ARM::DPR_VFP2RegisterClass || + RC == ARM::DPR_8RegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else if (RC == ARM::QPRRegisterClass || + RC == ARM::QPR_VFP2RegisterClass || + RC == ARM::QPR_8RegisterClass) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg) + .addFrameIndex(FI).addImm(128) + .addMemOperand(MMO)); + } else { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addMemOperand(MMO)); + } + } else if (RC == ARM::QQPRRegisterClass || RC == ARM::QQPR_VFP2RegisterClass){ + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLD2q32)); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); + AddDefaultPred(MIB.addFrameIndex(FI).addImm(128).addMemOperand(MMO)); + } else { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + .addMemOperand(MMO); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); + AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); + } + } else { + assert(RC == ARM::QQQQPRRegisterClass && "Unknown regclass!"); + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + .addMemOperand(MMO); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI); + AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI); + } +} + +MachineInstr* +ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, + int FrameIx, uint64_t Offset, + const MDNode *MDPtr, + DebugLoc DL) const { + MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::DBG_VALUE)) + .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr); + return &*MIB; +} + +MachineInstr *ARMBaseInstrInfo:: +foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, int FI) const { + if (Ops.size() != 1) return NULL; + + unsigned OpNum = Ops[0]; + unsigned Opc = MI->getOpcode(); + MachineInstr *NewMI = NULL; + if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { + // If it is updating CPSR, then it 
cannot be folded. + if (MI->getOperand(4).getReg() == ARM::CPSR && !MI->getOperand(4).isDead()) + return NULL; + unsigned Pred = MI->getOperand(2).getImm(); + unsigned PredReg = MI->getOperand(3).getReg(); + if (OpNum == 0) { // move -> store + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned SrcSubReg = MI->getOperand(1).getSubReg(); + bool isKill = MI->getOperand(1).isKill(); + bool isUndef = MI->getOperand(1).isUndef(); + if (Opc == ARM::MOVr) + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR)) + .addReg(SrcReg, + getKillRegState(isKill) | getUndefRegState(isUndef), + SrcSubReg) + .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); + else // ARM::t2MOVr + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12)) + .addReg(SrcReg, + getKillRegState(isKill) | getUndefRegState(isUndef), + SrcSubReg) + .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstSubReg = MI->getOperand(0).getSubReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isUndef = MI->getOperand(0).isUndef(); + if (Opc == ARM::MOVr) + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef), DstSubReg) + .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); + else // ARM::t2MOVr + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef), DstSubReg) + .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); + } + } else if (Opc == ARM::tMOVgpr2gpr || + Opc == ARM::tMOVtgpr2gpr || + Opc == ARM::tMOVgpr2tgpr) { + if (OpNum == 0) { // move -> store + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned SrcSubReg = MI->getOperand(1).getSubReg(); + bool isKill = MI->getOperand(1).isKill(); + bool isUndef = MI->getOperand(1).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12)) + .addReg(SrcReg, + getKillRegState(isKill) | getUndefRegState(isUndef), + SrcSubReg) + .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0); + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstSubReg = MI->getOperand(0).getSubReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isUndef = MI->getOperand(0).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef), + DstSubReg) + .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0); + } + } else if (Opc == ARM::VMOVS) { + unsigned Pred = MI->getOperand(2).getImm(); + unsigned PredReg = MI->getOperand(3).getReg(); + if (OpNum == 0) { // move -> store + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned SrcSubReg = MI->getOperand(1).getSubReg(); + bool isKill = MI->getOperand(1).isKill(); + bool isUndef = MI->getOperand(1).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTRS)) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef), + SrcSubReg) + .addFrameIndex(FI) + .addImm(0).addImm(Pred).addReg(PredReg); + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstSubReg = MI->getOperand(0).getSubReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isUndef = MI->getOperand(0).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDRS)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + 
getUndefRegState(isUndef), + DstSubReg) + .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); + } + } else if (Opc == ARM::VMOVD || Opc == ARM::VMOVDneon) { + unsigned Pred = MI->getOperand(2).getImm(); + unsigned PredReg = MI->getOperand(3).getReg(); + if (OpNum == 0) { // move -> store + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned SrcSubReg = MI->getOperand(1).getSubReg(); + bool isKill = MI->getOperand(1).isKill(); + bool isUndef = MI->getOperand(1).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTRD)) + .addReg(SrcReg, + getKillRegState(isKill) | getUndefRegState(isUndef), + SrcSubReg) + .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstSubReg = MI->getOperand(0).getSubReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isUndef = MI->getOperand(0).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDRD)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef), + DstSubReg) + .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); + } + } else if (Opc == ARM::VMOVQ) { + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Pred = MI->getOperand(2).getImm(); + unsigned PredReg = MI->getOperand(3).getReg(); + if (OpNum == 0) { // move -> store + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned SrcSubReg = MI->getOperand(1).getSubReg(); + bool isKill = MI->getOperand(1).isKill(); + bool isUndef = MI->getOperand(1).isUndef(); + if (MFI.getObjectAlignment(FI) >= 16 && + getRegisterInfo().canRealignStack(MF)) { + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VST1q)) + .addFrameIndex(FI).addImm(128) + .addReg(SrcReg, + getKillRegState(isKill) | getUndefRegState(isUndef), + SrcSubReg) + .addImm(Pred).addReg(PredReg); + } else { + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTMQ)) + .addReg(SrcReg, + getKillRegState(isKill) | getUndefRegState(isUndef), + SrcSubReg) + .addFrameIndex(FI).addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addImm(Pred).addReg(PredReg); + } + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstSubReg = MI->getOperand(0).getSubReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isUndef = MI->getOperand(0).isUndef(); + if (MFI.getObjectAlignment(FI) >= 16 && + getRegisterInfo().canRealignStack(MF)) { + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLD1q)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef), + DstSubReg) + .addFrameIndex(FI).addImm(128).addImm(Pred).addReg(PredReg); + } else { + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDMQ)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef), + DstSubReg) + .addFrameIndex(FI).addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addImm(Pred).addReg(PredReg); + } + } + } + + return NewMI; +} + +MachineInstr* +ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr* LoadMI) const { + // FIXME + return 0; +} + +bool +ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const { + if (Ops.size() != 1) return false; + + unsigned Opc = MI->getOpcode(); + if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { + // If it is updating CPSR, then it cannot be folded. 
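+ // e.g. a flag-setting "movs r0, r1" keeps CPSR live; rewriting it as a
+ // load or store would drop the flag update, so only the plain predicated
+ // form of the move is safe to fold.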
+ return MI->getOperand(4).getReg() != ARM::CPSR || + MI->getOperand(4).isDead(); + } else if (Opc == ARM::tMOVgpr2gpr || + Opc == ARM::tMOVtgpr2gpr || + Opc == ARM::tMOVgpr2tgpr) { + return true; + } else if (Opc == ARM::VMOVS || Opc == ARM::VMOVD || + Opc == ARM::VMOVDneon || Opc == ARM::VMOVQ) { + return true; + } + + // FIXME: VMOVQQ and VMOVQQQQ? + + return false; +} + +/// Create a copy of a const pool value. Update CPI to the new index and return +/// the label UID. +static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { + MachineConstantPool *MCP = MF.getConstantPool(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI]; + assert(MCPE.isMachineConstantPoolEntry() && + "Expecting a machine constantpool entry!"); + ARMConstantPoolValue *ACPV = + static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal); + + unsigned PCLabelId = AFI->createConstPoolEntryUId(); + ARMConstantPoolValue *NewCPV = 0; + if (ACPV->isGlobalValue()) + NewCPV = new ARMConstantPoolValue(ACPV->getGV(), PCLabelId, + ARMCP::CPValue, 4); + else if (ACPV->isExtSymbol()) + NewCPV = new ARMConstantPoolValue(MF.getFunction()->getContext(), + ACPV->getSymbol(), PCLabelId, 4); + else if (ACPV->isBlockAddress()) + NewCPV = new ARMConstantPoolValue(ACPV->getBlockAddress(), PCLabelId, + ARMCP::CPBlockAddress, 4); + else + llvm_unreachable("Unexpected ARM constantpool value type!!"); + CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment()); + return PCLabelId; +} + +void ARMBaseInstrInfo:: +reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo *TRI) const { + if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) { + DestReg = TRI->getSubReg(DestReg, SubIdx); + SubIdx = 0; + } + + unsigned Opcode = Orig->getOpcode(); + switch (Opcode) { + default: { + MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); + MI->getOperand(0).setReg(DestReg); + MBB.insert(I, MI); + break; + } + case ARM::tLDRpci_pic: + case ARM::t2LDRpci_pic: { + MachineFunction &MF = *MBB.getParent(); + unsigned CPI = Orig->getOperand(1).getIndex(); + unsigned PCLabelId = duplicateCPV(MF, CPI); + MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode), + DestReg) + .addConstantPoolIndex(CPI).addImm(PCLabelId); + (*MIB).setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end()); + break; + } + } + + MachineInstr *NewMI = prior(I); + NewMI->getOperand(0).setSubReg(SubIdx); +} + +MachineInstr * +ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const { + MachineInstr *MI = TargetInstrInfoImpl::duplicate(Orig, MF); + switch(Orig->getOpcode()) { + case ARM::tLDRpci_pic: + case ARM::t2LDRpci_pic: { + unsigned CPI = Orig->getOperand(1).getIndex(); + unsigned PCLabelId = duplicateCPV(MF, CPI); + Orig->getOperand(1).setIndex(CPI); + Orig->getOperand(2).setImm(PCLabelId); + break; + } + } + return MI; +} + +bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, + const MachineInstr *MI1) const { + int Opcode = MI0->getOpcode(); + if (Opcode == ARM::t2LDRpci || + Opcode == ARM::t2LDRpci_pic || + Opcode == ARM::tLDRpci || + Opcode == ARM::tLDRpci_pic) { + if (MI1->getOpcode() != Opcode) + return false; + if (MI0->getNumOperands() != MI1->getNumOperands()) + return false; + + const MachineOperand &MO0 = MI0->getOperand(1); + const MachineOperand &MO1 = MI1->getOperand(1); + if (MO0.getOffset() != 
MO1.getOffset()) + return false; + + const MachineFunction *MF = MI0->getParent()->getParent(); + const MachineConstantPool *MCP = MF->getConstantPool(); + int CPI0 = MO0.getIndex(); + int CPI1 = MO1.getIndex(); + const MachineConstantPoolEntry &MCPE0 = MCP->getConstants()[CPI0]; + const MachineConstantPoolEntry &MCPE1 = MCP->getConstants()[CPI1]; + ARMConstantPoolValue *ACPV0 = + static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal); + ARMConstantPoolValue *ACPV1 = + static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal); + return ACPV0->hasSameValue(ACPV1); + } + + return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs); +} + +/// getInstrPredicate - If instruction is predicated, returns its predicate +/// condition, otherwise returns AL. It also returns the condition code +/// register by reference. +ARMCC::CondCodes +llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) { + int PIdx = MI->findFirstPredOperandIdx(); + if (PIdx == -1) { + PredReg = 0; + return ARMCC::AL; + } + + PredReg = MI->getOperand(PIdx+1).getReg(); + return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm(); +} + + +int llvm::getMatchingCondBranchOpcode(int Opc) { + if (Opc == ARM::B) + return ARM::Bcc; + else if (Opc == ARM::tB) + return ARM::tBcc; + else if (Opc == ARM::t2B) + return ARM::t2Bcc; + + llvm_unreachable("Unknown unconditional branch opcode!"); + return 0; +} + + +void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII) { + bool isSub = NumBytes < 0; + if (isSub) NumBytes = -NumBytes; + + while (NumBytes) { + unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes); + unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt); + assert(ThisVal && "Didn't extract field correctly"); + + // We will handle these bits from offset, clear them. + NumBytes &= ~ThisVal; + + assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?"); + + // Build the new ADD / SUB. + unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri; + BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg, RegState::Kill).addImm(ThisVal) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + BaseReg = DestReg; + } +} + +bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) { + unsigned Opcode = MI.getOpcode(); + const TargetInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + bool isSub = false; + + // Memory operands in inline assembly always use AddrMode2. + if (Opcode == ARM::INLINEASM) + AddrMode = ARMII::AddrMode2; + + if (Opcode == ARM::ADDri) { + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + if (Offset == 0) { + // Turn it into a move. + MI.setDesc(TII.get(ARM::MOVr)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.RemoveOperand(FrameRegIdx+1); + Offset = 0; + return true; + } else if (Offset < 0) { + Offset = -Offset; + isSub = true; + MI.setDesc(TII.get(ARM::SUBri)); + } + + // Common case: small offset, fits into instruction. + if (ARM_AM::getSOImmVal(Offset) != -1) { + // Replace the FrameIndex with sp / fp + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + Offset = 0; + return true; + } + + // Otherwise, pull as much of the immedidate into this ADDri/SUBri + // as possible. 
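+ // (An ARM shifter-operand immediate is an 8-bit value rotated right by an
+ // even amount: e.g. 0x1F000 encodes as 0x1F ror 20, while 0x1001 spans too
+ // many bits for one encoding and must be split across two instructions.)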
+ unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset); + unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt); + + // We will handle these bits from offset, clear them. + Offset &= ~ThisImmVal; + + // Get the properly encoded SOImmVal field. + assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 && + "Bit extraction didn't work?"); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal); + } else { + unsigned ImmIdx = 0; + int InstrOffs = 0; + unsigned NumBits = 0; + unsigned Scale = 1; + switch (AddrMode) { + case ARMII::AddrMode2: { + ImmIdx = FrameRegIdx+2; + InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 12; + break; + } + case ARMII::AddrMode3: { + ImmIdx = FrameRegIdx+2; + InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 8; + break; + } + case ARMII::AddrMode4: + case ARMII::AddrMode6: + // Can't fold any offset even if it's zero. + return false; + case ARMII::AddrMode5: { + ImmIdx = FrameRegIdx+1; + InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 8; + Scale = 4; + break; + } + default: + llvm_unreachable("Unsupported addressing mode!"); + break; + } + + Offset += InstrOffs * Scale; + assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!"); + if (Offset < 0) { + Offset = -Offset; + isSub = true; + } + + // Attempt to fold address comp. if opcode has offset bits + if (NumBits > 0) { + // Common case: small offset, fits into instruction. + MachineOperand &ImmOp = MI.getOperand(ImmIdx); + int ImmedOffset = Offset / Scale; + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) { + // Replace the FrameIndex with sp + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + if (isSub) + ImmedOffset |= 1 << NumBits; + ImmOp.ChangeToImmediate(ImmedOffset); + Offset = 0; + return true; + } + + // Otherwise, it didn't fit. Pull in what we can to simplify the immed. + ImmedOffset = ImmedOffset & Mask; + if (isSub) + ImmedOffset |= 1 << NumBits; + ImmOp.ChangeToImmediate(ImmedOffset); + Offset &= ~(Mask*Scale); + } + } + + Offset = (isSub) ? -Offset : Offset; + return Offset == 0; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h new file mode 100644 index 0000000..b566271 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -0,0 +1,390 @@ +//===- ARMBaseInstrInfo.h - ARM Base Instruction Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Base ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMBASEINSTRUCTIONINFO_H +#define ARMBASEINSTRUCTIONINFO_H + +#include "ARM.h" +#include "ARMRegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace llvm { + +/// ARMII - This namespace holds all of the target specific flags that +/// instruction info tracks. 
+/// +namespace ARMII { + enum { + //===------------------------------------------------------------------===// + // Instruction Flags. + + //===------------------------------------------------------------------===// + // This four-bit field describes the addressing mode used. + + AddrModeMask = 0xf, + AddrModeNone = 0, + AddrMode1 = 1, + AddrMode2 = 2, + AddrMode3 = 3, + AddrMode4 = 4, + AddrMode5 = 5, + AddrMode6 = 6, + AddrModeT1_1 = 7, + AddrModeT1_2 = 8, + AddrModeT1_4 = 9, + AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data + AddrModeT2_i12 = 11, + AddrModeT2_i8 = 12, + AddrModeT2_so = 13, + AddrModeT2_pc = 14, // +/- i12 for pc relative data + AddrModeT2_i8s4 = 15, // i8 * 4 + + // Size* - Flags to keep track of the size of an instruction. + SizeShift = 4, + SizeMask = 7 << SizeShift, + SizeSpecial = 1, // 0 byte pseudo or special case. + Size8Bytes = 2, + Size4Bytes = 3, + Size2Bytes = 4, + + // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load + // and store ops only. Generic "updating" flag is used for ld/st multiple. + IndexModeShift = 7, + IndexModeMask = 3 << IndexModeShift, + IndexModePre = 1, + IndexModePost = 2, + IndexModeUpd = 3, + + //===------------------------------------------------------------------===// + // Instruction encoding formats. + // + FormShift = 9, + FormMask = 0x3f << FormShift, + + // Pseudo instructions + Pseudo = 0 << FormShift, + + // Multiply instructions + MulFrm = 1 << FormShift, + + // Branch instructions + BrFrm = 2 << FormShift, + BrMiscFrm = 3 << FormShift, + + // Data Processing instructions + DPFrm = 4 << FormShift, + DPSoRegFrm = 5 << FormShift, + + // Load and Store + LdFrm = 6 << FormShift, + StFrm = 7 << FormShift, + LdMiscFrm = 8 << FormShift, + StMiscFrm = 9 << FormShift, + LdStMulFrm = 10 << FormShift, + + LdStExFrm = 11 << FormShift, + + // Miscellaneous arithmetic instructions + ArithMiscFrm = 12 << FormShift, + + // Extend instructions + ExtFrm = 13 << FormShift, + + // VFP formats + VFPUnaryFrm = 14 << FormShift, + VFPBinaryFrm = 15 << FormShift, + VFPConv1Frm = 16 << FormShift, + VFPConv2Frm = 17 << FormShift, + VFPConv3Frm = 18 << FormShift, + VFPConv4Frm = 19 << FormShift, + VFPConv5Frm = 20 << FormShift, + VFPLdStFrm = 21 << FormShift, + VFPLdStMulFrm = 22 << FormShift, + VFPMiscFrm = 23 << FormShift, + + // Thumb format + ThumbFrm = 24 << FormShift, + + // NEON format + NEONFrm = 25 << FormShift, + NEONGetLnFrm = 26 << FormShift, + NEONSetLnFrm = 27 << FormShift, + NEONDupFrm = 28 << FormShift, + + //===------------------------------------------------------------------===// + // Misc flags. + + // UnaryDP - Indicates this is a unary data processing instruction, i.e. + // it doesn't have a Rn operand. + UnaryDP = 1 << 15, + + // Xform16Bit - Indicates this Thumb2 instruction may be transformed into + // a 16-bit Thumb instruction if certain conditions are met. + Xform16Bit = 1 << 16, + + //===------------------------------------------------------------------===// + // Code domain. + DomainShift = 17, + DomainMask = 3 << DomainShift, + DomainGeneral = 0 << DomainShift, + DomainVFP = 1 << DomainShift, + DomainNEON = 2 << DomainShift, + + //===------------------------------------------------------------------===// + // Field shifts - such shifts are used to set field while generating + // machine instructions. 
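+ // Illustrative use: an encoder might place the destination-register field
+ // with something like
+ //   Binary |= RegNum << ARMII::RegRdShift;
+ // before emitting the 32-bit instruction word.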
+ M_BitShift = 5, + ShiftImmShift = 5, + ShiftShift = 7, + N_BitShift = 7, + ImmHiShift = 8, + SoRotImmShift = 8, + RegRsShift = 8, + ExtRotImmShift = 10, + RegRdLoShift = 12, + RegRdShift = 12, + RegRdHiShift = 16, + RegRnShift = 16, + S_BitShift = 20, + W_BitShift = 21, + AM3_I_BitShift = 22, + D_BitShift = 22, + U_BitShift = 23, + P_BitShift = 24, + I_BitShift = 25, + CondShift = 28 + }; + + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // ARM Specific MachineOperand flags. + + MO_NO_FLAG, + + /// MO_LO16 - On a symbol operand, this represents a relocation containing + /// lower 16 bit of the address. Used only via movw instruction. + MO_LO16, + + /// MO_HI16 - On a symbol operand, this represents a relocation containing + /// higher 16 bit of the address. Used only via movt instruction. + MO_HI16 + }; +} + +class ARMBaseInstrInfo : public TargetInstrInfoImpl { + const ARMSubtarget& Subtarget; +protected: + // Can be only subclassed. + explicit ARMBaseInstrInfo(const ARMSubtarget &STI); +public: + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. + virtual unsigned getUnindexedOpcode(unsigned Opc) const =0; + + virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0; + const ARMSubtarget &getSubtarget() const { return Subtarget; } + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + // Branch analysis. + virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const; + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond) const; + + virtual + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + + // Predication support. + bool isPredicated(const MachineInstr *MI) const { + int PIdx = MI->findFirstPredOperandIdx(); + return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL; + } + + ARMCC::CondCodes getPredicate(const MachineInstr *MI) const { + int PIdx = MI->findFirstPredOperandIdx(); + return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm() + : ARMCC::AL; + } + + virtual + bool PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const; + + virtual + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const; + + virtual bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; + + virtual bool isPredicable(MachineInstr *MI) const; + + /// GetInstSize - Returns the size of the specified MachineInstr. + /// + virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const; + + /// Return true if the instruction is a register to register move and return + /// the source and dest operands and their sub-register indices by reference. 
+ virtual bool isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SrcSubIdx, unsigned &DstSubIdx) const; + + virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const; + virtual unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const; + + virtual bool copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC, + DebugLoc DL) const; + + virtual void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, + int FrameIx, + uint64_t Offset, + const MDNode *MDPtr, + DebugLoc DL) const; + + virtual bool canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const; + + virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const; + + virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr* LoadMI) const; + + virtual void reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo *TRI) const; + + MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const; + + virtual bool produceSameValue(const MachineInstr *MI0, + const MachineInstr *MI1) const; +}; + +static inline +const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) { + return MIB.addImm((int64_t)ARMCC::AL).addReg(0); +} + +static inline +const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) { + return MIB.addReg(0); +} + +static inline +const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB, + bool isDead = false) { + return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead)); +} + +static inline +const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) { + return MIB.addReg(0); +} + +static inline +bool isUncondBranchOpcode(int Opc) { + return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B; +} + +static inline +bool isCondBranchOpcode(int Opc) { + return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc; +} + +static inline +bool isJumpTableBranchOpcode(int Opc) { + return Opc == ARM::BR_JTr || Opc == ARM::BR_JTm || Opc == ARM::BR_JTadd || + Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT; +} + +static inline +bool isIndirectBranchOpcode(int Opc) { + return Opc == ARM::BRIND || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; +} + +/// getInstrPredicate - If instruction is predicated, returns its predicate +/// condition, otherwise returns AL. It also returns the condition code +/// register by reference. +ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg); + +int getMatchingCondBranchOpcode(int Opc); + +/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of +/// instructions to materializea destreg = basereg + immediate in ARM / Thumb2 +/// code. 
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII); + +void emitT2RegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII); + + +/// rewriteARMFrameIndex / rewriteT2FrameIndex - +/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the +/// offset could not be handled directly in MI, and return the left-over +/// portion by reference. +bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII); + +bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII); + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp new file mode 100644 index 0000000..82458d2 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -0,0 +1,1703 @@ +//===- ARMBaseRegisterInfo.cpp - ARM Register Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the base ARM implementation of TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/LLVMContext.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { +cl::opt<bool> +ReuseFrameIndexVals("arm-reuse-frame-index-vals", cl::Hidden, cl::init(true), + cl::desc("Reuse repeated frame index values")); +} + +using namespace llvm; + +unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum, + bool *isSPVFP) { + if (isSPVFP) + *isSPVFP = false; + + using namespace ARM; + switch (RegEnum) { + default: + llvm_unreachable("Unknown ARM register!"); + case R0: case D0: case Q0: return 0; + case R1: case D1: case Q1: return 1; + case R2: case D2: case Q2: return 2; + case R3: case D3: case Q3: return 3; + case R4: case D4: case Q4: return 4; + case R5: case D5: case Q5: return 5; + case R6: case D6: case Q6: return 6; + case R7: case D7: case Q7: return 7; + case R8: case D8: case Q8: return 8; + case R9: case D9: case Q9: return 9; + case 
R10: case D10: case Q10: return 10; + case R11: case D11: case Q11: return 11; + case R12: case D12: case Q12: return 12; + case SP: case D13: case Q13: return 13; + case LR: case D14: case Q14: return 14; + case PC: case D15: case Q15: return 15; + + case D16: return 16; + case D17: return 17; + case D18: return 18; + case D19: return 19; + case D20: return 20; + case D21: return 21; + case D22: return 22; + case D23: return 23; + case D24: return 24; + case D25: return 25; + case D26: return 26; + case D27: return 27; + case D28: return 28; + case D29: return 29; + case D30: return 30; + case D31: return 31; + + case S0: case S1: case S2: case S3: + case S4: case S5: case S6: case S7: + case S8: case S9: case S10: case S11: + case S12: case S13: case S14: case S15: + case S16: case S17: case S18: case S19: + case S20: case S21: case S22: case S23: + case S24: case S25: case S26: case S27: + case S28: case S29: case S30: case S31: { + if (isSPVFP) + *isSPVFP = true; + switch (RegEnum) { + default: return 0; // Avoid compile time warning. + case S0: return 0; + case S1: return 1; + case S2: return 2; + case S3: return 3; + case S4: return 4; + case S5: return 5; + case S6: return 6; + case S7: return 7; + case S8: return 8; + case S9: return 9; + case S10: return 10; + case S11: return 11; + case S12: return 12; + case S13: return 13; + case S14: return 14; + case S15: return 15; + case S16: return 16; + case S17: return 17; + case S18: return 18; + case S19: return 19; + case S20: return 20; + case S21: return 21; + case S22: return 22; + case S23: return 23; + case S24: return 24; + case S25: return 25; + case S26: return 26; + case S27: return 27; + case S28: return 28; + case S29: return 29; + case S30: return 30; + case S31: return 31; + } + } + } +} + +ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &sti) + : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), + TII(tii), STI(sti), + FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11) { +} + +const unsigned* +ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + static const unsigned CalleeSavedRegs[] = { + ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8, + ARM::R7, ARM::R6, ARM::R5, ARM::R4, + + ARM::D15, ARM::D14, ARM::D13, ARM::D12, + ARM::D11, ARM::D10, ARM::D9, ARM::D8, + 0 + }; + + static const unsigned DarwinCalleeSavedRegs[] = { + // Darwin ABI deviates from ARM standard ABI. R9 is not a callee-saved + // register. + ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4, + ARM::R11, ARM::R10, ARM::R8, + + ARM::D15, ARM::D14, ARM::D13, ARM::D12, + ARM::D11, ARM::D10, ARM::D9, ARM::D8, + 0 + }; + return STI.isTargetDarwin() ? 
DarwinCalleeSavedRegs : CalleeSavedRegs; +} + +const TargetRegisterClass* const * +ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { + static const TargetRegisterClass * const CalleeSavedRegClasses[] = { + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + 0 + }; + + static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = { + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass, + &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass, + + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + 0 + }; + + static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = { + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, + + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + 0 + }; + + static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={ + &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass, + &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, + + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + 0 + }; + + if (STI.isThumb1Only()) { + return STI.isTargetDarwin() + ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses; + } + return STI.isTargetDarwin() + ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses; +} + +BitVector ARMBaseRegisterInfo:: +getReservedRegs(const MachineFunction &MF) const { + // FIXME: avoid re-calculating this everytime. + BitVector Reserved(getNumRegs()); + Reserved.set(ARM::SP); + Reserved.set(ARM::PC); + if (STI.isTargetDarwin() || hasFP(MF)) + Reserved.set(FramePtr); + // Some targets reserve R9. + if (STI.isR9Reserved()) + Reserved.set(ARM::R9); + return Reserved; +} + +bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF, + unsigned Reg) const { + switch (Reg) { + default: break; + case ARM::SP: + case ARM::PC: + return true; + case ARM::R7: + case ARM::R11: + if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF))) + return true; + break; + case ARM::R9: + return STI.isR9Reserved(); + } + + return false; +} + +const TargetRegisterClass * +ARMBaseRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, + unsigned SubIdx) const { + switch (SubIdx) { + default: return 0; + case ARM::ssub_0: + case ARM::ssub_1: + case ARM::ssub_2: + case ARM::ssub_3: { + // S sub-registers. 
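+ // e.g. coalescing a copy through ssub_0..ssub_3 is only legal if the wider
+ // register stays in a class whose members actually have S sub-registers
+ // (D0-D15 / Q0-Q7), hence the narrowing to the *_VFP2 classes below.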
+ if (A->getSize() == 8) { + if (B == &ARM::SPR_8RegClass) + return &ARM::DPR_8RegClass; + assert(B == &ARM::SPRRegClass && "Expecting SPR register class!"); + if (A == &ARM::DPR_8RegClass) + return A; + return &ARM::DPR_VFP2RegClass; + } + + if (A->getSize() == 16) { + if (B == &ARM::SPR_8RegClass) + return &ARM::QPR_8RegClass; + return &ARM::QPR_VFP2RegClass; + } + + if (A->getSize() == 32) { + if (B == &ARM::SPR_8RegClass) + return 0; // Do not allow coalescing! + return &ARM::QQPR_VFP2RegClass; + } + + assert(A->getSize() == 64 && "Expecting a QQQQ register class!"); + return 0; // Do not allow coalescing! + } + case ARM::dsub_0: + case ARM::dsub_1: + case ARM::dsub_2: + case ARM::dsub_3: { + // D sub-registers. + if (A->getSize() == 16) { + if (B == &ARM::DPR_VFP2RegClass) + return &ARM::QPR_VFP2RegClass; + if (B == &ARM::DPR_8RegClass) + return 0; // Do not allow coalescing! + return A; + } + + if (A->getSize() == 32) { + if (B == &ARM::DPR_VFP2RegClass) + return &ARM::QQPR_VFP2RegClass; + if (B == &ARM::DPR_8RegClass) + return 0; // Do not allow coalescing! + return A; + } + + assert(A->getSize() == 64 && "Expecting a QQQQ register class!"); + if (B != &ARM::DPRRegClass) + return 0; // Do not allow coalescing! + return A; + } + case ARM::dsub_4: + case ARM::dsub_5: + case ARM::dsub_6: + case ARM::dsub_7: { + // D sub-registers of QQQQ registers. + if (A->getSize() == 64 && B == &ARM::DPRRegClass) + return A; + return 0; // Do not allow coalescing! + } + + case ARM::qsub_0: + case ARM::qsub_1: { + // Q sub-registers. + if (A->getSize() == 32) { + if (B == &ARM::QPR_VFP2RegClass) + return &ARM::QQPR_VFP2RegClass; + if (B == &ARM::QPR_8RegClass) + return 0; // Do not allow coalescing! + return A; + } + + assert(A->getSize() == 64 && "Expecting a QQQQ register class!"); + if (B == &ARM::QPRRegClass) + return A; + return 0; // Do not allow coalescing! + } + case ARM::qsub_2: + case ARM::qsub_3: { + // Q sub-registers of QQQQ registers. + if (A->getSize() == 64 && B == &ARM::QPRRegClass) + return A; + return 0; // Do not allow coalescing! + } + } + return 0; +} + +bool +ARMBaseRegisterInfo::canCombinedSubRegIndex(const TargetRegisterClass *RC, + SmallVectorImpl<unsigned> &SubIndices, + unsigned &NewSubIdx) const { + + unsigned Size = RC->getSize() * 8; + if (Size < 6) + return 0; + + NewSubIdx = 0; // Whole register. + unsigned NumRegs = SubIndices.size(); + if (NumRegs == 8) { + // 8 D registers -> 1 QQQQ register. + return (Size == 512 && + SubIndices[0] == ARM::dsub_0 && + SubIndices[1] == ARM::dsub_1 && + SubIndices[2] == ARM::dsub_2 && + SubIndices[3] == ARM::dsub_3 && + SubIndices[4] == ARM::dsub_4 && + SubIndices[5] == ARM::dsub_5 && + SubIndices[6] == ARM::dsub_6 && + SubIndices[7] == ARM::dsub_7); + } else if (NumRegs == 4) { + if (SubIndices[0] == ARM::qsub_0) { + // 4 Q registers -> 1 QQQQ register. + return (Size == 512 && + SubIndices[1] == ARM::qsub_1 && + SubIndices[2] == ARM::qsub_2 && + SubIndices[3] == ARM::qsub_3); + } else if (SubIndices[0] == ARM::dsub_0) { + // 4 D registers -> 1 QQ register. + if (Size >= 256 && + SubIndices[1] == ARM::dsub_1 && + SubIndices[2] == ARM::dsub_2 && + SubIndices[3] == ARM::dsub_3) { + if (Size == 512) + NewSubIdx = ARM::qqsub_0; + return true; + } + } else if (SubIndices[0] == ARM::dsub_4) { + // 4 D registers -> 1 QQ register (2nd). 
+ if (Size == 512 && + SubIndices[1] == ARM::dsub_5 && + SubIndices[2] == ARM::dsub_6 && + SubIndices[3] == ARM::dsub_7) { + NewSubIdx = ARM::qqsub_1; + return true; + } + } else if (SubIndices[0] == ARM::ssub_0) { + // 4 S registers -> 1 Q register. + if (Size >= 128 && + SubIndices[1] == ARM::ssub_1 && + SubIndices[2] == ARM::ssub_2 && + SubIndices[3] == ARM::ssub_3) { + if (Size >= 256) + NewSubIdx = ARM::qsub_0; + return true; + } + } + } else if (NumRegs == 2) { + if (SubIndices[0] == ARM::qsub_0) { + // 2 Q registers -> 1 QQ register. + if (Size >= 256 && SubIndices[1] == ARM::qsub_1) { + if (Size == 512) + NewSubIdx = ARM::qqsub_0; + return true; + } + } else if (SubIndices[0] == ARM::qsub_2) { + // 2 Q registers -> 1 QQ register (2nd). + if (Size == 512 && SubIndices[1] == ARM::qsub_3) { + NewSubIdx = ARM::qqsub_1; + return true; + } + } else if (SubIndices[0] == ARM::dsub_0) { + // 2 D registers -> 1 Q register. + if (Size >= 128 && SubIndices[1] == ARM::dsub_1) { + if (Size >= 256) + NewSubIdx = ARM::qsub_0; + return true; + } + } else if (SubIndices[0] == ARM::dsub_2) { + // 2 D registers -> 1 Q register (2nd). + if (Size >= 256 && SubIndices[1] == ARM::dsub_3) { + NewSubIdx = ARM::qsub_1; + return true; + } + } else if (SubIndices[0] == ARM::dsub_4) { + // 2 D registers -> 1 Q register (3rd). + if (Size == 512 && SubIndices[1] == ARM::dsub_5) { + NewSubIdx = ARM::qsub_2; + return true; + } + } else if (SubIndices[0] == ARM::dsub_6) { + // 2 D registers -> 1 Q register (3rd). + if (Size == 512 && SubIndices[1] == ARM::dsub_7) { + NewSubIdx = ARM::qsub_3; + return true; + } + } else if (SubIndices[0] == ARM::ssub_0) { + // 2 S registers -> 1 D register. + if (SubIndices[1] == ARM::ssub_1) { + if (Size >= 128) + NewSubIdx = ARM::dsub_0; + return true; + } + } else if (SubIndices[0] == ARM::ssub_2) { + // 2 S registers -> 1 D register (2nd). + if (Size >= 128 && SubIndices[1] == ARM::ssub_3) { + NewSubIdx = ARM::dsub_1; + return true; + } + } + } + return false; +} + + +const TargetRegisterClass * +ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const { + return ARM::GPRRegisterClass; +} + +/// getAllocationOrder - Returns the register allocation order for a specified +/// register class in the form of a pair of TargetRegisterClass iterators. +std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> +ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, + unsigned HintType, unsigned HintReg, + const MachineFunction &MF) const { + // Alternative register allocation orders when favoring even / odd registers + // of register pairs. + + // No FP, R9 is available. + static const unsigned GPREven1[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, + ARM::R9, ARM::R11 + }; + static const unsigned GPROdd1[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R9, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, + ARM::R8, ARM::R10 + }; + + // FP is R7, R9 is available. + static const unsigned GPREven2[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R8, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, + ARM::R9, ARM::R11 + }; + static const unsigned GPROdd2[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R9, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, + ARM::R8, ARM::R10 + }; + + // FP is R11, R9 is available. 
+ static const unsigned GPREven3[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, + ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, + ARM::R9 + }; + static const unsigned GPROdd3[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R6, ARM::R9, + ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R7, + ARM::R8 + }; + + // No FP, R9 is not available. + static const unsigned GPREven4[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8, + ARM::R11 + }; + static const unsigned GPROdd4[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8, + ARM::R10 + }; + + // FP is R7, R9 is not available. + static const unsigned GPREven5[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, ARM::R8, + ARM::R11 + }; + static const unsigned GPROdd5[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8, + ARM::R10 + }; + + // FP is R11, R9 is not available. + static const unsigned GPREven6[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, + ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8 + }; + static const unsigned GPROdd6[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R7, + ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8 + }; + + + if (HintType == ARMRI::RegPairEven) { + if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0) + // It's no longer possible to fulfill this hint. Return the default + // allocation order. + return std::make_pair(RC->allocation_order_begin(MF), + RC->allocation_order_end(MF)); + + if (!STI.isTargetDarwin() && !hasFP(MF)) { + if (!STI.isR9Reserved()) + return std::make_pair(GPREven1, + GPREven1 + (sizeof(GPREven1)/sizeof(unsigned))); + else + return std::make_pair(GPREven4, + GPREven4 + (sizeof(GPREven4)/sizeof(unsigned))); + } else if (FramePtr == ARM::R7) { + if (!STI.isR9Reserved()) + return std::make_pair(GPREven2, + GPREven2 + (sizeof(GPREven2)/sizeof(unsigned))); + else + return std::make_pair(GPREven5, + GPREven5 + (sizeof(GPREven5)/sizeof(unsigned))); + } else { // FramePtr == ARM::R11 + if (!STI.isR9Reserved()) + return std::make_pair(GPREven3, + GPREven3 + (sizeof(GPREven3)/sizeof(unsigned))); + else + return std::make_pair(GPREven6, + GPREven6 + (sizeof(GPREven6)/sizeof(unsigned))); + } + } else if (HintType == ARMRI::RegPairOdd) { + if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0) + // It's no longer possible to fulfill this hint. Return the default + // allocation order. 
+ return std::make_pair(RC->allocation_order_begin(MF), + RC->allocation_order_end(MF)); + + if (!STI.isTargetDarwin() && !hasFP(MF)) { + if (!STI.isR9Reserved()) + return std::make_pair(GPROdd1, + GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned))); + else + return std::make_pair(GPROdd4, + GPROdd4 + (sizeof(GPROdd4)/sizeof(unsigned))); + } else if (FramePtr == ARM::R7) { + if (!STI.isR9Reserved()) + return std::make_pair(GPROdd2, + GPROdd2 + (sizeof(GPROdd2)/sizeof(unsigned))); + else + return std::make_pair(GPROdd5, + GPROdd5 + (sizeof(GPROdd5)/sizeof(unsigned))); + } else { // FramePtr == ARM::R11 + if (!STI.isR9Reserved()) + return std::make_pair(GPROdd3, + GPROdd3 + (sizeof(GPROdd3)/sizeof(unsigned))); + else + return std::make_pair(GPROdd6, + GPROdd6 + (sizeof(GPROdd6)/sizeof(unsigned))); + } + } + return std::make_pair(RC->allocation_order_begin(MF), + RC->allocation_order_end(MF)); +} + +/// ResolveRegAllocHint - Resolves the specified register allocation hint +/// to a physical register. Returns the physical register if it is successful. +unsigned +ARMBaseRegisterInfo::ResolveRegAllocHint(unsigned Type, unsigned Reg, + const MachineFunction &MF) const { + if (Reg == 0 || !isPhysicalRegister(Reg)) + return 0; + if (Type == 0) + return Reg; + else if (Type == (unsigned)ARMRI::RegPairOdd) + // Odd register. + return getRegisterPairOdd(Reg, MF); + else if (Type == (unsigned)ARMRI::RegPairEven) + // Even register. + return getRegisterPairEven(Reg, MF); + return 0; +} + +void +ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, + MachineFunction &MF) const { + MachineRegisterInfo *MRI = &MF.getRegInfo(); + std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg); + if ((Hint.first == (unsigned)ARMRI::RegPairOdd || + Hint.first == (unsigned)ARMRI::RegPairEven) && + Hint.second && TargetRegisterInfo::isVirtualRegister(Hint.second)) { + // If 'Reg' is one of the even / odd register pair and it's now changed + // (e.g. coalesced) into a different register. The other register of the + // pair allocation hint must be updated to reflect the relationship + // change. + unsigned OtherReg = Hint.second; + Hint = MRI->getRegAllocationHint(OtherReg); + if (Hint.second == Reg) + // Make sure the pair has not already divorced. + MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg); + } +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. 
+/// +bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + return ((DisableFramePointerElim(MF) && MFI->adjustsStack())|| + needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken()); +} + +bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + return (RealignStack && + !AFI->isThumb1OnlyFunction() && + !MFI->hasVarSizedObjects()); +} + +bool ARMBaseRegisterInfo:: +needsStackRealignment(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned StackAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); + return (RealignStack && + !AFI->isThumb1OnlyFunction() && + (MFI->getMaxAlignment() > StackAlign) && + !MFI->hasVarSizedObjects()); +} + +bool ARMBaseRegisterInfo:: +cannotEliminateFrame(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + if (DisableFramePointerElim(MF) && MFI->adjustsStack()) + return true; + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() + || needsStackRealignment(MF); +} + +/// estimateStackSize - Estimate and return the size of the frame. +static unsigned estimateStackSize(MachineFunction &MF) { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + int Offset = 0; + for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -FFI->getObjectOffset(i); + if (FixedOff > Offset) Offset = FixedOff; + } + for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { + if (FFI->isDeadObjectIndex(i)) + continue; + Offset += FFI->getObjectSize(i); + unsigned Align = FFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset+Align-1)/Align*Align; + } + return (unsigned)Offset; +} + +/// estimateRSStackSizeLimit - Look at each instruction that references stack +/// frames and return the stack size limit beyond which some of these +/// instructions will require a scratch register during their expansion later. +unsigned +ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const { + unsigned Limit = (1 << 12) - 1; + for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) { + for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); + I != E; ++I) { + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + if (!I->getOperand(i).isFI()) continue; + switch (I->getDesc().TSFlags & ARMII::AddrModeMask) { + case ARMII::AddrMode3: + case ARMII::AddrModeT2_i8: + Limit = std::min(Limit, (1U << 8) - 1); + break; + case ARMII::AddrMode5: + case ARMII::AddrModeT2_i8s4: + Limit = std::min(Limit, ((1U << 8) - 1) * 4); + break; + case ARMII::AddrModeT2_i12: + if (hasFP(MF)) Limit = std::min(Limit, (1U << 8) - 1); + break; + case ARMII::AddrMode6: + // Addressing mode 6 (load/store) instructions can't encode an + // immediate offset for stack references. + return 0; + default: + break; + } + break; // At most one FI per instruction + } + } + } + + return Limit; +} + +void +ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const { + // This tells PEI to spill the FP as if it is any other callee-save register + // to take advantage the eliminateFrameIndex machinery. 
This also ensures it + // is spilled in the order specified by getCalleeSavedRegs() to make it easier + // to combine multiple loads / stores. + bool CanEliminateFrame = true; + bool CS1Spilled = false; + bool LRSpilled = false; + unsigned NumGPRSpills = 0; + SmallVector<unsigned, 4> UnspilledCS1GPRs; + SmallVector<unsigned, 4> UnspilledCS2GPRs; + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + // Spill R4 if Thumb2 function requires stack realignment - it will be used as + // scratch register. + // FIXME: It will be better just to find spare register here. + if (needsStackRealignment(MF) && + AFI->isThumb2Function()) + MF.getRegInfo().setPhysRegUsed(ARM::R4); + + // Spill LR if Thumb1 function uses variable length argument lists. + if (AFI->isThumb1OnlyFunction() && AFI->getVarArgsRegSaveSize() > 0) + MF.getRegInfo().setPhysRegUsed(ARM::LR); + + // Don't spill FP if the frame can be eliminated. This is determined + // by scanning the callee-save registers to see if any is used. + const unsigned *CSRegs = getCalleeSavedRegs(); + const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses(); + for (unsigned i = 0; CSRegs[i]; ++i) { + unsigned Reg = CSRegs[i]; + bool Spilled = false; + if (MF.getRegInfo().isPhysRegUsed(Reg)) { + AFI->setCSRegisterIsSpilled(Reg); + Spilled = true; + CanEliminateFrame = false; + } else { + // Check alias registers too. + for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) { + if (MF.getRegInfo().isPhysRegUsed(*Aliases)) { + Spilled = true; + CanEliminateFrame = false; + } + } + } + + if (CSRegClasses[i] == ARM::GPRRegisterClass || + CSRegClasses[i] == ARM::tGPRRegisterClass) { + if (Spilled) { + NumGPRSpills++; + + if (!STI.isTargetDarwin()) { + if (Reg == ARM::LR) + LRSpilled = true; + CS1Spilled = true; + continue; + } + + // Keep track if LR and any of R4, R5, R6, and R7 is spilled. + switch (Reg) { + case ARM::LR: + LRSpilled = true; + // Fallthrough + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + CS1Spilled = true; + break; + default: + break; + } + } else { + if (!STI.isTargetDarwin()) { + UnspilledCS1GPRs.push_back(Reg); + continue; + } + + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + UnspilledCS1GPRs.push_back(Reg); + break; + default: + UnspilledCS2GPRs.push_back(Reg); + break; + } + } + } + } + + bool ForceLRSpill = false; + if (!LRSpilled && AFI->isThumb1OnlyFunction()) { + unsigned FnSize = TII.GetFunctionSizeInBytes(MF); + // Force LR to be spilled if the Thumb function size is > 2048. This enables + // use of BL to implement far jump. If it turns out that it's not needed + // then the branch fix up path will undo it. + if (FnSize >= (1 << 11)) { + CanEliminateFrame = false; + ForceLRSpill = true; + } + } + + // If any of the stack slot references may be out of range of an immediate + // offset, make sure a register (or a spill slot) is available for the + // register scavenger. Note that if we're indexing off the frame pointer, the + // effective stack size is 4 bytes larger since the FP points to the stack + // slot of the previous FP. + bool BigStack = RS && + estimateStackSize(MF) + (hasFP(MF) ? 4 : 0) >= estimateRSStackSizeLimit(MF); + + bool ExtraCSSpill = false; + if (BigStack || !CanEliminateFrame || cannotEliminateFrame(MF)) { + AFI->setHasStackFrame(true); + + // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled. + // Spill LR as well so we can fold BX_RET to the registers restore (LDM). 
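+ // e.g. the epilogue can then pop straight into the program counter
+ // ("ldmia sp!, {r4, r7, pc}") instead of restoring LR and branching with a
+ // separate "bx lr".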
+ if (!LRSpilled && CS1Spilled) { + MF.getRegInfo().setPhysRegUsed(ARM::LR); + AFI->setCSRegisterIsSpilled(ARM::LR); + NumGPRSpills++; + UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(), + UnspilledCS1GPRs.end(), (unsigned)ARM::LR)); + ForceLRSpill = false; + ExtraCSSpill = true; + } + + // Darwin ABI requires FP to point to the stack slot that contains the + // previous FP. + if (STI.isTargetDarwin() || hasFP(MF)) { + MF.getRegInfo().setPhysRegUsed(FramePtr); + NumGPRSpills++; + } + + // If stack and double are 8-byte aligned and we are spilling an odd number + // of GPRs. Spill one extra callee save GPR so we won't have to pad between + // the integer and double callee save areas. + unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); + if (TargetAlign == 8 && (NumGPRSpills & 1)) { + if (CS1Spilled && !UnspilledCS1GPRs.empty()) { + for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { + unsigned Reg = UnspilledCS1GPRs[i]; + // Don't spill high register if the function is thumb1 + if (!AFI->isThumb1OnlyFunction() || + isARMLowRegister(Reg) || Reg == ARM::LR) { + MF.getRegInfo().setPhysRegUsed(Reg); + AFI->setCSRegisterIsSpilled(Reg); + if (!isReservedReg(MF, Reg)) + ExtraCSSpill = true; + break; + } + } + } else if (!UnspilledCS2GPRs.empty() && + !AFI->isThumb1OnlyFunction()) { + unsigned Reg = UnspilledCS2GPRs.front(); + MF.getRegInfo().setPhysRegUsed(Reg); + AFI->setCSRegisterIsSpilled(Reg); + if (!isReservedReg(MF, Reg)) + ExtraCSSpill = true; + } + } + + // Estimate if we might need to scavenge a register at some point in order + // to materialize a stack offset. If so, either spill one additional + // callee-saved register or reserve a special spill slot to facilitate + // register scavenging. Thumb1 needs a spill slot for stack pointer + // adjustments also, even when the frame itself is small. + if (BigStack && !ExtraCSSpill) { + // If any non-reserved CS register isn't spilled, just spill one or two + // extra. That should take care of it! + unsigned NumExtras = TargetAlign / 4; + SmallVector<unsigned, 2> Extras; + while (NumExtras && !UnspilledCS1GPRs.empty()) { + unsigned Reg = UnspilledCS1GPRs.back(); + UnspilledCS1GPRs.pop_back(); + if (!isReservedReg(MF, Reg) && + (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) || + Reg == ARM::LR)) { + Extras.push_back(Reg); + NumExtras--; + } + } + // For non-Thumb1 functions, also check for hi-reg CS registers + if (!AFI->isThumb1OnlyFunction()) { + while (NumExtras && !UnspilledCS2GPRs.empty()) { + unsigned Reg = UnspilledCS2GPRs.back(); + UnspilledCS2GPRs.pop_back(); + if (!isReservedReg(MF, Reg)) { + Extras.push_back(Reg); + NumExtras--; + } + } + } + if (Extras.size() && NumExtras == 0) { + for (unsigned i = 0, e = Extras.size(); i != e; ++i) { + MF.getRegInfo().setPhysRegUsed(Extras[i]); + AFI->setCSRegisterIsSpilled(Extras[i]); + } + } else if (!AFI->isThumb1OnlyFunction()) { + // note: Thumb1 functions spill to R12, not the stack. Reserve a slot + // closest to SP or frame pointer. 
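+ // The register scavenger spills some live register into this slot when it
+ // needs a temporary to materialize an out-of-range stack offset.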
+ const TargetRegisterClass *RC = ARM::GPRRegisterClass; + MachineFrameInfo *MFI = MF.getFrameInfo(); + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + } + } + } + + if (ForceLRSpill) { + MF.getRegInfo().setPhysRegUsed(ARM::LR); + AFI->setCSRegisterIsSpilled(ARM::LR); + AFI->setLRIsSpilledForFarJump(true); + } +} + +unsigned ARMBaseRegisterInfo::getRARegister() const { + return ARM::LR; +} + +unsigned +ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + if (STI.isTargetDarwin() || hasFP(MF)) + return FramePtr; + return ARM::SP; +} + +int +ARMBaseRegisterInfo::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + FrameReg = ARM::SP; + if (AFI->isGPRCalleeSavedArea1Frame(FI)) + Offset -= AFI->getGPRCalleeSavedArea1Offset(); + else if (AFI->isGPRCalleeSavedArea2Frame(FI)) + Offset -= AFI->getGPRCalleeSavedArea2Offset(); + else if (AFI->isDPRCalleeSavedAreaFrame(FI)) + Offset -= AFI->getDPRCalleeSavedAreaOffset(); + else if (needsStackRealignment(MF)) { + // When dynamically realigning the stack, use the frame pointer for + // parameters, and the stack pointer for locals. + assert (hasFP(MF) && "dynamic stack realignment without a FP!"); + if (isFixed) { + FrameReg = getFrameRegister(MF); + Offset -= AFI->getFramePtrSpillOffset(); + } + } else if (hasFP(MF) && AFI->hasStackFrame()) { + if (isFixed || MFI->hasVarSizedObjects()) { + // Use frame pointer to reference fixed objects unless this is a + // frameless function. + FrameReg = getFrameRegister(MF); + Offset -= AFI->getFramePtrSpillOffset(); + } else if (AFI->isThumb2Function()) { + // In Thumb2 mode, the negative offset is very limited. + int FPOffset = Offset - AFI->getFramePtrSpillOffset(); + if (FPOffset >= -255 && FPOffset < 0) { + FrameReg = getFrameRegister(MF); + Offset = FPOffset; + } + } + } + return Offset; +} + + +int +ARMBaseRegisterInfo::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + unsigned FrameReg; + return getFrameIndexReference(MF, FI, FrameReg); +} + +unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const { + llvm_unreachable("What is the exception register"); + return 0; +} + +unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const { + llvm_unreachable("What is the exception handler register"); + return 0; +} + +int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { + return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); +} + +unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg, + const MachineFunction &MF) const { + switch (Reg) { + default: break; + // Return 0 if either register of the pair is a special register. + // So no R12, etc. + case ARM::R1: + return ARM::R0; + case ARM::R3: + return ARM::R2; + case ARM::R5: + return ARM::R4; + case ARM::R7: + return isReservedReg(MF, ARM::R7) ? 0 : ARM::R6; + case ARM::R9: + return isReservedReg(MF, ARM::R9) ? 0 :ARM::R8; + case ARM::R11: + return isReservedReg(MF, ARM::R11) ? 
0 : ARM::R10; + + case ARM::S1: + return ARM::S0; + case ARM::S3: + return ARM::S2; + case ARM::S5: + return ARM::S4; + case ARM::S7: + return ARM::S6; + case ARM::S9: + return ARM::S8; + case ARM::S11: + return ARM::S10; + case ARM::S13: + return ARM::S12; + case ARM::S15: + return ARM::S14; + case ARM::S17: + return ARM::S16; + case ARM::S19: + return ARM::S18; + case ARM::S21: + return ARM::S20; + case ARM::S23: + return ARM::S22; + case ARM::S25: + return ARM::S24; + case ARM::S27: + return ARM::S26; + case ARM::S29: + return ARM::S28; + case ARM::S31: + return ARM::S30; + + case ARM::D1: + return ARM::D0; + case ARM::D3: + return ARM::D2; + case ARM::D5: + return ARM::D4; + case ARM::D7: + return ARM::D6; + case ARM::D9: + return ARM::D8; + case ARM::D11: + return ARM::D10; + case ARM::D13: + return ARM::D12; + case ARM::D15: + return ARM::D14; + case ARM::D17: + return ARM::D16; + case ARM::D19: + return ARM::D18; + case ARM::D21: + return ARM::D20; + case ARM::D23: + return ARM::D22; + case ARM::D25: + return ARM::D24; + case ARM::D27: + return ARM::D26; + case ARM::D29: + return ARM::D28; + case ARM::D31: + return ARM::D30; + } + + return 0; +} + +unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg, + const MachineFunction &MF) const { + switch (Reg) { + default: break; + // Return 0 if either register of the pair is a special register. + // So no R12, etc. + case ARM::R0: + return ARM::R1; + case ARM::R2: + return ARM::R3; + case ARM::R4: + return ARM::R5; + case ARM::R6: + return isReservedReg(MF, ARM::R7) ? 0 : ARM::R7; + case ARM::R8: + return isReservedReg(MF, ARM::R9) ? 0 :ARM::R9; + case ARM::R10: + return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11; + + case ARM::S0: + return ARM::S1; + case ARM::S2: + return ARM::S3; + case ARM::S4: + return ARM::S5; + case ARM::S6: + return ARM::S7; + case ARM::S8: + return ARM::S9; + case ARM::S10: + return ARM::S11; + case ARM::S12: + return ARM::S13; + case ARM::S14: + return ARM::S15; + case ARM::S16: + return ARM::S17; + case ARM::S18: + return ARM::S19; + case ARM::S20: + return ARM::S21; + case ARM::S22: + return ARM::S23; + case ARM::S24: + return ARM::S25; + case ARM::S26: + return ARM::S27; + case ARM::S28: + return ARM::S29; + case ARM::S30: + return ARM::S31; + + case ARM::D0: + return ARM::D1; + case ARM::D2: + return ARM::D3; + case ARM::D4: + return ARM::D5; + case ARM::D6: + return ARM::D7; + case ARM::D8: + return ARM::D9; + case ARM::D10: + return ARM::D11; + case ARM::D12: + return ARM::D13; + case ARM::D14: + return ARM::D15; + case ARM::D16: + return ARM::D17; + case ARM::D18: + return ARM::D19; + case ARM::D20: + return ARM::D21; + case ARM::D22: + return ARM::D23; + case ARM::D24: + return ARM::D25; + case ARM::D26: + return ARM::D27; + case ARM::D28: + return ARM::D29; + case ARM::D30: + return ARM::D31; + } + + return 0; +} + +/// emitLoadConstPool - Emits a load from constpool to materialize the +/// specified immediate. 
+void ARMBaseRegisterInfo:: +emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, + unsigned PredReg) const { + MachineFunction &MF = *MBB.getParent(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + const Constant *C = + ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx) + .addReg(0).addImm(0).addImm(Pred).addReg(PredReg); +} + +bool ARMBaseRegisterInfo:: +requiresRegisterScavenging(const MachineFunction &MF) const { + return true; +} + +bool ARMBaseRegisterInfo:: +requiresFrameIndexScavenging(const MachineFunction &MF) const { + return true; +} + +// hasReservedCallFrame - Under normal circumstances, when a frame pointer is +// not required, we reserve argument space for call sites in the function +// immediately on entry to the current function. This eliminates the need for +// add/sub sp brackets around call sites. Returns true if the call frame is +// included as part of the stack frame. +bool ARMBaseRegisterInfo:: +hasReservedCallFrame(MachineFunction &MF) const { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + unsigned CFSize = FFI->getMaxCallFrameSize(); + // It's not always a good idea to include the call frame as part of the + // stack frame. ARM (especially Thumb) has small immediate offset to + // address the stack frame. So a large call frame can cause poor codegen + // and may even makes it impossible to scavenge a register. + if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12 + return false; + + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +// canSimplifyCallFramePseudos - If there is a reserved call frame, the +// call frame pseudos can be simplified. Unlike most targets, having a FP +// is not sufficient here since we still may reference some objects via SP +// even when FP is available in Thumb2 mode. +bool ARMBaseRegisterInfo:: +canSimplifyCallFramePseudos(MachineFunction &MF) const { + return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects(); +} + +static void +emitSPUpdate(bool isARM, + MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + DebugLoc dl, const ARMBaseInstrInfo &TII, + int NumBytes, + ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, + Pred, PredReg, TII); + else + emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, + Pred, PredReg, TII); +} + + +void ARMBaseRegisterInfo:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + if (!hasReservedCallFrame(MF)) { + // If we have alloca, convert as follows: + // ADJCALLSTACKDOWN -> sub, sp, sp, amount + // ADJCALLSTACKUP -> add, sp, sp, amount + MachineInstr *Old = I; + DebugLoc dl = Old->getDebugLoc(); + unsigned Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. 
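+      // For example, with the usual Align of 8, an Amount of 20 bytes is
+      // rounded up to (20 + 8 - 1) / 8 * 8 == 24 bytes.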
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + assert(!AFI->isThumb1OnlyFunction() && + "This eliminateCallFramePseudoInstr does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + + // Replace the pseudo instruction with a new instruction... + unsigned Opc = Old->getOpcode(); + int PIdx = Old->findFirstPredOperandIdx(); + ARMCC::CondCodes Pred = (PIdx == -1) + ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm(); + if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { + // Note: PredReg is operand 2 for ADJCALLSTACKDOWN. + unsigned PredReg = Old->getOperand(2).getReg(); + emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, Pred, PredReg); + } else { + // Note: PredReg is operand 3 for ADJCALLSTACKUP. + unsigned PredReg = Old->getOperand(3).getReg(); + assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); + emitSPUpdate(isARM, MBB, I, dl, TII, Amount, Pred, PredReg); + } + } + } + MBB.erase(I); +} + +unsigned +ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, FrameIndexValue *Value, + RegScavenger *RS) const { + unsigned i = 0; + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + assert(!AFI->isThumb1OnlyFunction() && + "This eliminateFrameIndex does not support Thumb1!"); + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + + int FrameIndex = MI.getOperand(i).getIndex(); + unsigned FrameReg; + + int Offset = getFrameIndexReference(MF, FrameIndex, FrameReg); + if (FrameReg != ARM::SP) + SPAdj = 0; + Offset += SPAdj; + + // Special handling of dbg_value instructions. + if (MI.isDebugValue()) { + MI.getOperand(i). ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(i+1).ChangeToImmediate(Offset); + return 0; + } + + // Modify MI as necessary to handle as much of 'Offset' as possible + bool Done = false; + if (!AFI->isThumbFunction()) + Done = rewriteARMFrameIndex(MI, i, FrameReg, Offset, TII); + else { + assert(AFI->isThumb2Function()); + Done = rewriteT2FrameIndex(MI, i, FrameReg, Offset, TII); + } + if (Done) + return 0; + + // If we get here, the immediate doesn't fit into the instruction. We folded + // as much as possible above, handle the rest, providing a register that is + // SP+LargeImm. + assert((Offset || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) && + "This code isn't needed if offset already handled!"); + + unsigned ScratchReg = 0; + int PIdx = MI.findFirstPredOperandIdx(); + ARMCC::CondCodes Pred = (PIdx == -1) + ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm(); + unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg(); + if (Offset == 0) + // Must be addrmode4/6. 
+ MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false); + else { + ScratchReg = MF.getRegInfo().createVirtualRegister(ARM::GPRRegisterClass); + if (Value) { + Value->first = FrameReg; // use the frame register as a kind indicator + Value->second = Offset; + } + if (!AFI->isThumbFunction()) + emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, + Offset, Pred, PredReg, TII); + else { + assert(AFI->isThumb2Function()); + emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, + Offset, Pred, PredReg, TII); + } + MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true); + if (!ReuseFrameIndexVals) + ScratchReg = 0; + } + return ScratchReg; +} + +/// Move iterator past the next bunch of callee save load / store ops for +/// the particular spill area (1: integer area 1, 2: integer area 2, +/// 3: fp area, 0: don't care). +static void movePastCSLoadStoreOps(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + int Opc1, int Opc2, unsigned Area, + const ARMSubtarget &STI) { + while (MBBI != MBB.end() && + ((MBBI->getOpcode() == Opc1) || (MBBI->getOpcode() == Opc2)) && + MBBI->getOperand(1).isFI()) { + if (Area != 0) { + bool Done = false; + unsigned Category = 0; + switch (MBBI->getOperand(0).getReg()) { + case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7: + case ARM::LR: + Category = 1; + break; + case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11: + Category = STI.isTargetDarwin() ? 2 : 1; + break; + case ARM::D8: case ARM::D9: case ARM::D10: case ARM::D11: + case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15: + Category = 3; + break; + default: + Done = true; + break; + } + if (Done || Category != Area) + break; + } + + ++MBBI; + } +} + +void ARMBaseRegisterInfo:: +emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + assert(!AFI->isThumb1OnlyFunction() && + "This emitPrologue does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned NumBytes = MFI->getStackSize(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Determine the sizes of each callee-save spill areas and record which frame + // belongs to which callee-save spill areas. + unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; + int FramePtrSpillFI = 0; + + // Allocate the vararg register save area. This is not counted in NumBytes. 
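+  // For example, in a varargs function whose named arguments occupy only r0,
+  // r1-r3 are saved here, so VARegSaveSize is 12 and SP is dropped by 12 bytes
+  // before anything else in the prologue.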
+ if (VARegSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); + return; + } + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + int FI = CSI[i].getFrameIdx(); + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + break; + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + if (STI.isTargetDarwin()) { + AFI->addGPRCalleeSavedArea2Frame(FI); + GPRCS2Size += 4; + } else { + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + } + break; + default: + AFI->addDPRCalleeSavedAreaFrame(FI); + DPRCSSize += 8; + } + } + + // Build the new SUBri to adjust SP for integer callee-save spill area 1. + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS1Size); + movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 1, STI); + + // Set FP to point to the stack slot that contains the previous FP. + // For Darwin, FP is R7, which has now been stored in spill area 1. + // Otherwise, if this is not Darwin, all the callee-saved registers go + // into spill area 1, including the FP in R11. In either case, it is + // now safe to emit this assignment. + if (STI.isTargetDarwin() || hasFP(MF)) { + unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr) + .addFrameIndex(FramePtrSpillFI).addImm(0); + AddDefaultCC(AddDefaultPred(MIB)); + } + + // Build the new SUBri to adjust SP for integer callee-save spill area 2. + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS2Size); + + // Build the new SUBri to adjust SP for FP callee-save spill area. + movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 2, STI); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRCSSize); + + // Determine starting offsets of spill areas. + unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); + unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; + unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; + if (STI.isTargetDarwin() || hasFP(MF)) + AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + + NumBytes); + AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); + AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); + AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); + + movePastCSLoadStoreOps(MBB, MBBI, ARM::VSTRD, 0, 3, STI); + NumBytes = DPRCSOffset; + if (NumBytes) { + // Adjust SP after all the callee-save spills. + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); + } + + if (STI.isTargetELF() && hasFP(MF)) { + MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - + AFI->getFramePtrSpillOffset()); + } + + AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); + AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); + AFI->setDPRCalleeSavedAreaSize(DPRCSSize); + + // If we need dynamic stack realignment, do it here. 
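+  // For example, with MaxAlign == 16 the code below emits "bic sp, sp, #15",
+  // which clears the low four bits of SP and rounds it down to the next
+  // 16-byte boundary.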
+ if (needsStackRealignment(MF)) { + unsigned MaxAlign = MFI->getMaxAlignment(); + assert (!AFI->isThumb1OnlyFunction()); + if (!AFI->isThumbFunction()) { + // Emit bic sp, sp, MaxAlign + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, + TII.get(ARM::BICri), ARM::SP) + .addReg(ARM::SP, RegState::Kill) + .addImm(MaxAlign-1))); + } else { + // We cannot use sp as source/dest register here, thus we're emitting the + // following sequence: + // mov r4, sp + // bic r4, r4, MaxAlign + // mov sp, r4 + // FIXME: It will be better just to find spare register here. + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2tgpr), ARM::R4) + .addReg(ARM::SP, RegState::Kill); + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, + TII.get(ARM::t2BICri), ARM::R4) + .addReg(ARM::R4, RegState::Kill) + .addImm(MaxAlign-1))); + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) + .addReg(ARM::R4, RegState::Kill); + } + } +} + +static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +static bool isCSRestore(MachineInstr *MI, + const ARMBaseInstrInfo &TII, + const unsigned *CSRegs) { + return ((MI->getOpcode() == (int)ARM::VLDRD || + MI->getOpcode() == (int)ARM::LDR || + MI->getOpcode() == (int)ARM::t2LDRi12) && + MI->getOperand(1).isFI() && + isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs)); +} + +void ARMBaseRegisterInfo:: +emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + assert(MBBI->getDesc().isReturn() && + "Can only insert epilog into returning blocks"); + DebugLoc dl = MBBI->getDebugLoc(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + assert(!AFI->isThumb1OnlyFunction() && + "This emitEpilogue does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + int NumBytes = (int)MFI->getStackSize(); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + } else { + // Unwind MBBI to point to first LDR / VLDRD. + const unsigned *CSRegs = getCalleeSavedRegs(); + if (MBBI != MBB.begin()) { + do + --MBBI; + while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); + if (!isCSRestore(MBBI, TII, CSRegs)) + ++MBBI; + } + + // Move SP to start of FP callee save spill area. + NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + + AFI->getGPRCalleeSavedArea2Size() + + AFI->getDPRCalleeSavedAreaSize()); + + // Darwin ABI requires FP to point to the stack slot that contains the + // previous FP. + bool HasFP = hasFP(MF); + if ((STI.isTargetDarwin() && NumBytes) || HasFP) { + NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot or target is ELF and the function has FP. + if (HasFP || + AFI->getGPRCalleeSavedArea2Size() || + AFI->getDPRCalleeSavedAreaSize() || + AFI->getDPRCalleeSavedAreaOffset()) { + if (NumBytes) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + else + emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + } else { + // Thumb2 or ARM. 
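+          // NumBytes is zero here, so SP can simply be copied back from the
+          // frame pointer.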
+ if (isARM) + BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP) + .addReg(FramePtr) + .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP) + .addReg(FramePtr); + } + } + } else if (NumBytes) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + + // Move SP to start of integer callee save spill area 2. + movePastCSLoadStoreOps(MBB, MBBI, ARM::VLDRD, 0, 3, STI); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedAreaSize()); + + // Move SP to start of integer callee save spill area 1. + movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 2, STI); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea2Size()); + + // Move SP to SP upon entry to the function. + movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 1, STI); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea1Size()); + } + + if (VARegSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize); +} + +#include "ARMGenRegisterInfo.inc" diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h new file mode 100644 index 0000000..2c9c82d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -0,0 +1,173 @@ +//===- ARMBaseRegisterInfo.h - ARM Register Information Impl ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the base ARM implementation of TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMBASEREGISTERINFO_H +#define ARMBASEREGISTERINFO_H + +#include "ARM.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "ARMGenRegisterInfo.h.inc" + +namespace llvm { + class ARMSubtarget; + class ARMBaseInstrInfo; + class Type; + +/// Register allocation hints. +namespace ARMRI { + enum { + RegPairOdd = 1, + RegPairEven = 2 + }; +} + +/// isARMLowRegister - Returns true if the register is low register r0-r7. +/// +static inline bool isARMLowRegister(unsigned Reg) { + using namespace ARM; + switch (Reg) { + case R0: case R1: case R2: case R3: + case R4: case R5: case R6: case R7: + return true; + default: + return false; + } +} + +struct ARMBaseRegisterInfo : public ARMGenRegisterInfo { +protected: + const ARMBaseInstrInfo &TII; + const ARMSubtarget &STI; + + /// FramePtr - ARM physical register used as frame ptr. + unsigned FramePtr; + + // Can be only subclassed. + explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &STI); + + // Return the opcode that implements 'Op', or 0 if no opcode + unsigned getOpcode(int Op) const; + +public: + /// getRegisterNumbering - Given the enum value for some register, e.g. + /// ARM::LR, return the number that it corresponds to (e.g. 14). It + /// also returns true in isSPVFP if the register is a single precision + /// VFP register. + static unsigned getRegisterNumbering(unsigned RegEnum, bool *isSPVFP = 0); + + /// Code Generation virtual methods... 
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; + + const TargetRegisterClass* const* + getCalleeSavedRegClasses(const MachineFunction *MF = 0) const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + /// getMatchingSuperRegClass - Return a subclass of the specified register + /// class A so that each register in it has a sub-register of the + /// specified sub-register index which is in the specified register class B. + virtual const TargetRegisterClass * + getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, unsigned Idx) const; + + /// canCombinedSubRegIndex - Given a register class and a list of sub-register + /// indices, return true if it's possible to combine the sub-register indices + /// into one that corresponds to a larger sub-register. Return the new sub- + /// register index by reference. Note the new index may be zero if the given + /// sub-registers combine to form the whole register. + virtual bool canCombinedSubRegIndex(const TargetRegisterClass *RC, + SmallVectorImpl<unsigned> &SubIndices, + unsigned &NewSubIdx) const; + + const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + + std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> + getAllocationOrder(const TargetRegisterClass *RC, + unsigned HintType, unsigned HintReg, + const MachineFunction &MF) const; + + unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg, + const MachineFunction &MF) const; + + void UpdateRegAllocHint(unsigned Reg, unsigned NewReg, + MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const; + + bool canRealignStack(const MachineFunction &MF) const; + bool needsStackRealignment(const MachineFunction &MF) const; + + bool cannotEliminateFrame(const MachineFunction &MF) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = NULL) const; + + // Debug information queries. + unsigned getRARegister() const; + unsigned getFrameRegister(const MachineFunction &MF) const; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + + // Exception handling queries. + unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; + + int getDwarfRegNum(unsigned RegNum, bool isEH) const; + + bool isLowRegister(unsigned Reg) const; + + + /// emitLoadConstPool - Emits a load from constpool to materialize the + /// specified immediate. + virtual void emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) const; + + /// Code Generation virtual methods...
+ virtual bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + + virtual bool requiresRegisterScavenging(const MachineFunction &MF) const; + + virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const; + + virtual bool hasReservedCallFrame(MachineFunction &MF) const; + virtual bool canSimplifyCallFramePseudos(MachineFunction &MF) const; + + virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, FrameIndexValue *Value = NULL, + RegScavenger *RS = NULL) const; + + virtual void emitPrologue(MachineFunction &MF) const; + virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + +private: + unsigned estimateRSStackSizeLimit(MachineFunction &MF) const; + + unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const; + + unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h b/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h new file mode 100644 index 0000000..3b38375 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h @@ -0,0 +1,64 @@ +//===-------- ARMBuildAttrs.h - ARM Build Attributes ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains enumerations and support routines for ARM build attributes +// as defined in ARM ABI addenda document (ABI release 2.07). +// +//===----------------------------------------------------------------------===// + +#ifndef __TARGET_ARMBUILDATTRS_H__ +#define __TARGET_ARMBUILDATTRS_H__ + +namespace ARMBuildAttrs { + enum { + File = 1, + Section = 2, + Symbol = 3, + CPU_raw_name = 4, + CPU_name = 5, + CPU_arch = 6, + CPU_arch_profile = 7, + ARM_ISA_use = 8, + THUMB_ISA_use = 9, + VFP_arch = 10, + WMMX_arch = 11, + Advanced_SIMD_arch = 12, + PCS_config = 13, + ABI_PCS_R9_use = 14, + ABI_PCS_RW_data = 15, + ABI_PCS_RO_data = 16, + ABI_PCS_GOT_use = 17, + ABI_PCS_wchar_t = 18, + ABI_FP_rounding = 19, + ABI_FP_denormal = 20, + ABI_FP_exceptions = 21, + ABI_FP_user_exceptions = 22, + ABI_FP_number_model = 23, + ABI_align8_needed = 24, + ABI_align8_preserved = 25, + ABI_enum_size = 26, + ABI_HardFP_use = 27, + ABI_VFP_args = 28, + ABI_WMMX_args = 29, + ABI_optimization_goals = 30, + ABI_FP_optimization_goals = 31, + compatibility = 32, + CPU_unaligned_access = 34, + VFP_HP_extension = 36, + ABI_FP_16bit_format = 38, + nodefaults = 64, + also_compatible_with = 65, + T2EE_use = 66, + conformance = 67, + Virtualization_use = 68, + MPextension_use = 70 + }; +} + +#endif // __TARGET_ARMBUILDATTRS_H__ diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td new file mode 100644 index 0000000..8fdb07f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td @@ -0,0 +1,132 @@ +//===- ARMCallingConv.td - Calling Conventions for ARM ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// This describes the calling conventions for ARM architecture. +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. +class CCIfSubtarget<string F, CCAction A>: + CCIf<!strconcat("State.getTarget().getSubtarget<ARMSubtarget>().", F), A>; + +/// CCIfAlign - Match of the original alignment of the arg +class CCIfAlign<string Align, CCAction A>: + CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; + +//===----------------------------------------------------------------------===// +// ARM APCS Calling Convention +//===----------------------------------------------------------------------===// +def CC_ARM_APCS : CallingConv<[ + + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack + CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>, + + CCIfType<[f32], CCBitConvertToType<i32>>, + CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, + + CCIfType<[i32], CCAssignToStack<4, 4>>, + CCIfType<[f64], CCAssignToStack<8, 4>>, + CCIfType<[v2f64], CCAssignToStack<16, 4>> +]>; + +def RetCC_ARM_APCS : CallingConv<[ + CCIfType<[f32], CCBitConvertToType<i32>>, + + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>, + + CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>> +]>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS (EABI) Calling Convention, common parts +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS_Common : CallingConv<[ + + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // i64/f64 is passed in even pairs of GPRs + // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register + // (and the same is true for f64 if VFP is not enabled) + CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>, + CCIfType<[i32], CCIf<"State.getNextStackOffset() == 0 &&" + "ArgFlags.getOrigAlign() != 8", + CCAssignToReg<[R0, R1, R2, R3]>>>, + + CCIfType<[i32], CCIfAlign<"8", CCAssignToStack<4, 8>>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[f64], CCAssignToStack<8, 8>>, + CCIfType<[v2f64], CCAssignToStack<16, 8>> +]>; + +def RetCC_ARM_AAPCS_Common : CallingConv<[ + CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>> +]>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS (EABI) Calling Convention +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. 
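+  // (e.g. a v4i16 argument is first bit-converted to f64 and then passed by
+  //  the f64 rules below)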
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>, + CCIfType<[f32], CCBitConvertToType<i32>>, + CCDelegateTo<CC_ARM_AAPCS_Common> +]>; + +def RetCC_ARM_AAPCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, + CCIfType<[f32], CCBitConvertToType<i32>>, + CCDelegateTo<RetCC_ARM_AAPCS_Common> +]>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS-VFP (EABI) Calling Convention +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS_VFP : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<CC_ARM_AAPCS_Common> +]>; + +def RetCC_ARM_AAPCS_VFP : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<RetCC_ARM_AAPCS_Common> +]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp new file mode 100644 index 0000000..f2730fc --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp @@ -0,0 +1,1542 @@ +//===-- ARM/ARMCodeEmitter.cpp - Convert ARM code to machine code ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the pass that transforms the ARM machine instructions into +// relocatable machine code. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jit" +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMConstantPoolValue.h" +#include "ARMInstrInfo.h" +#include "ARMRelocations.h" +#include "ARMSubtarget.h" +#include "ARMTargetMachine.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#ifndef NDEBUG +#include <iomanip> +#endif +using namespace llvm; + +STATISTIC(NumEmitted, "Number of machine instructions emitted"); + +namespace { + + class ARMCodeEmitter : public MachineFunctionPass { + ARMJITInfo *JTI; + const ARMInstrInfo *II; + const TargetData *TD; + const ARMSubtarget *Subtarget; + TargetMachine &TM; + JITCodeEmitter &MCE; + MachineModuleInfo *MMI; + const std::vector<MachineConstantPoolEntry> *MCPEs; + const std::vector<MachineJumpTableEntry> *MJTEs; + bool IsPIC; + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineModuleInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + static char ID; + public: + ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) + : MachineFunctionPass(&ID), JTI(0), + II((const ARMInstrInfo *)tm.getInstrInfo()), + TD(tm.getTargetData()), TM(tm), + MCE(mce), MCPEs(0), MJTEs(0), + IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} + + /// getBinaryCodeForInstr - This function, generated by the + /// CodeEmitterGenerator using TableGen, produces the binary encoding for + /// machine instructions. 
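+    /// It supplies the per-opcode fixed bits; the emit*Instruction helpers
+    /// below generally OR in the predicate, register, and immediate fields.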
+ unsigned getBinaryCodeForInstr(const MachineInstr &MI); + + bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "ARM Machine Code Emitter"; + } + + void emitInstruction(const MachineInstr &MI); + + private: + + void emitWordLE(unsigned Binary); + void emitDWordLE(uint64_t Binary); + void emitConstPoolInstruction(const MachineInstr &MI); + void emitMOVi32immInstruction(const MachineInstr &MI); + void emitMOVi2piecesInstruction(const MachineInstr &MI); + void emitLEApcrelJTInstruction(const MachineInstr &MI); + void emitPseudoMoveInstruction(const MachineInstr &MI); + void addPCLabel(unsigned LabelID); + void emitPseudoInstruction(const MachineInstr &MI); + unsigned getMachineSoRegOpValue(const MachineInstr &MI, + const TargetInstrDesc &TID, + const MachineOperand &MO, + unsigned OpIdx); + + unsigned getMachineSoImmOpValue(unsigned SoImm); + + unsigned getAddrModeSBit(const MachineInstr &MI, + const TargetInstrDesc &TID) const; + + void emitDataProcessingInstruction(const MachineInstr &MI, + unsigned ImplicitRd = 0, + unsigned ImplicitRn = 0); + + void emitLoadStoreInstruction(const MachineInstr &MI, + unsigned ImplicitRd = 0, + unsigned ImplicitRn = 0); + + void emitMiscLoadStoreInstruction(const MachineInstr &MI, + unsigned ImplicitRn = 0); + + void emitLoadStoreMultipleInstruction(const MachineInstr &MI); + + void emitMulFrmInstruction(const MachineInstr &MI); + + void emitExtendInstruction(const MachineInstr &MI); + + void emitMiscArithInstruction(const MachineInstr &MI); + + void emitBranchInstruction(const MachineInstr &MI); + + void emitInlineJumpTable(unsigned JTIndex); + + void emitMiscBranchInstruction(const MachineInstr &MI); + + void emitVFPArithInstruction(const MachineInstr &MI); + + void emitVFPConversionInstruction(const MachineInstr &MI); + + void emitVFPLoadStoreInstruction(const MachineInstr &MI); + + void emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI); + + void emitMiscInstruction(const MachineInstr &MI); + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MachineInstr &MI,const MachineOperand &MO); + unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) { + return getMachineOpValue(MI, MI.getOperand(OpIdx)); + } + + /// getMovi32Value - Return binary encoding of operand for movw/movt. If the + /// machine operand requires relocation, record the relocation and return zero. + unsigned getMovi32Value(const MachineInstr &MI,const MachineOperand &MO, + unsigned Reloc); + unsigned getMovi32Value(const MachineInstr &MI, unsigned OpIdx, + unsigned Reloc) { + return getMovi32Value(MI, MI.getOperand(OpIdx), Reloc); + } + + /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value. + /// + unsigned getShiftOp(unsigned Imm) const ; + + /// Routines that handle operands which add machine relocations which are + /// fixed up by the relocation stage. 
+ void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, + bool MayNeedFarStub, bool Indirect, + intptr_t ACPV = 0); + void emitExternalSymbolAddress(const char *ES, unsigned Reloc); + void emitConstPoolAddress(unsigned CPI, unsigned Reloc); + void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc); + void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc, + intptr_t JTBase = 0); + }; +} + +char ARMCodeEmitter::ID = 0; + +/// createARMJITCodeEmitterPass - Return a pass that emits the collected ARM +/// code to the specified MCE object. +FunctionPass *llvm::createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM, + JITCodeEmitter &JCE) { + return new ARMCodeEmitter(TM, JCE); +} + +bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) { + assert((MF.getTarget().getRelocationModel() != Reloc::Default || + MF.getTarget().getRelocationModel() != Reloc::Static) && + "JIT relocation model must be set to static or default!"); + JTI = ((ARMTargetMachine &)MF.getTarget()).getJITInfo(); + II = ((const ARMTargetMachine &)MF.getTarget()).getInstrInfo(); + TD = ((const ARMTargetMachine &)MF.getTarget()).getTargetData(); + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + MCPEs = &MF.getConstantPool()->getConstants(); + MJTEs = 0; + if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables(); + IsPIC = TM.getRelocationModel() == Reloc::PIC_; + JTI->Initialize(MF, IsPIC); + MMI = &getAnalysis<MachineModuleInfo>(); + MCE.setModuleInfo(MMI); + + do { + DEBUG(errs() << "JITTing function '" + << MF.getFunction()->getName() << "'\n"); + MCE.startFunction(MF); + for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); + MBB != E; ++MBB) { + MCE.StartMachineBasicBlock(MBB); + for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) + emitInstruction(*I); + } + } while (MCE.finishFunction(MF)); + + return false; +} + +/// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value. +/// +unsigned ARMCodeEmitter::getShiftOp(unsigned Imm) const { + switch (ARM_AM::getAM2ShiftOpc(Imm)) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::asr: return 2; + case ARM_AM::lsl: return 0; + case ARM_AM::lsr: return 1; + case ARM_AM::ror: + case ARM_AM::rrx: return 3; + } + return 0; +} + +/// getMovi32Value - Return binary encoding of operand for movw/movt. If the +/// machine operand requires relocation, record the relocation and return zero. +unsigned ARMCodeEmitter::getMovi32Value(const MachineInstr &MI, + const MachineOperand &MO, + unsigned Reloc) { + assert(((Reloc == ARM::reloc_arm_movt) || (Reloc == ARM::reloc_arm_movw)) + && "Relocation to this function should be for movt or movw"); + + if (MO.isImm()) + return static_cast<unsigned>(MO.getImm()); + else if (MO.isGlobal()) + emitGlobalAddress(MO.getGlobal(), Reloc, true, false); + else if (MO.isSymbol()) + emitExternalSymbolAddress(MO.getSymbolName(), Reloc); + else if (MO.isMBB()) + emitMachineBasicBlock(MO.getMBB(), Reloc); + else { +#ifndef NDEBUG + errs() << MO; +#endif + llvm_unreachable("Unsupported operand type for movw/movt"); + } + return 0; +} + +/// getMachineOpValue - Return binary encoding of operand. If the machine +/// operand requires relocation, record the relocation and return zero. 
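+/// (For example, a register operand simply yields its register number, while a
+/// global-address operand records a relocation and contributes 0 to the word.)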
+unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) { + if (MO.isReg()) + return ARMRegisterInfo::getRegisterNumbering(MO.getReg()); + else if (MO.isImm()) + return static_cast<unsigned>(MO.getImm()); + else if (MO.isGlobal()) + emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true, false); + else if (MO.isSymbol()) + emitExternalSymbolAddress(MO.getSymbolName(), ARM::reloc_arm_branch); + else if (MO.isCPI()) { + const TargetInstrDesc &TID = MI.getDesc(); + // For VFP load, the immediate offset is multiplied by 4. + unsigned Reloc = ((TID.TSFlags & ARMII::FormMask) == ARMII::VFPLdStFrm) + ? ARM::reloc_arm_vfp_cp_entry : ARM::reloc_arm_cp_entry; + emitConstPoolAddress(MO.getIndex(), Reloc); + } else if (MO.isJTI()) + emitJumpTableAddress(MO.getIndex(), ARM::reloc_arm_relative); + else if (MO.isMBB()) + emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch); + else { +#ifndef NDEBUG + errs() << MO; +#endif + llvm_unreachable(0); + } + return 0; +} + +/// emitGlobalAddress - Emit the specified address to the code stream. +/// +void ARMCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, + bool MayNeedFarStub, bool Indirect, + intptr_t ACPV) { + MachineRelocation MR = Indirect + ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc, + const_cast<GlobalValue *>(GV), + ACPV, MayNeedFarStub) + : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc, + const_cast<GlobalValue *>(GV), ACPV, + MayNeedFarStub); + MCE.addRelocation(MR); +} + +/// emitExternalSymbolAddress - Arrange for the address of an external symbol to +/// be emitted to the current location in the function, and allow it to be PC +/// relative. +void ARMCodeEmitter::emitExternalSymbolAddress(const char *ES, unsigned Reloc) { + MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), + Reloc, ES)); +} + +/// emitConstPoolAddress - Arrange for the address of an constant pool +/// to be emitted to the current location in the function, and allow it to be PC +/// relative. +void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) { + // Tell JIT emitter we'll resolve the address. + MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(), + Reloc, CPI, 0, true)); +} + +/// emitJumpTableAddress - Arrange for the address of a jump table to +/// be emitted to the current location in the function, and allow it to be PC +/// relative. +void ARMCodeEmitter::emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) { + MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(), + Reloc, JTIndex, 0, true)); +} + +/// emitMachineBasicBlock - Emit the specified address basic block. 
+void ARMCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB, + unsigned Reloc, intptr_t JTBase) { + MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(), + Reloc, BB, JTBase)); +} + +void ARMCodeEmitter::emitWordLE(unsigned Binary) { + DEBUG(errs() << " 0x"; + errs().write_hex(Binary) << "\n"); + MCE.emitWordLE(Binary); +} + +void ARMCodeEmitter::emitDWordLE(uint64_t Binary) { + DEBUG(errs() << " 0x"; + errs().write_hex(Binary) << "\n"); + MCE.emitDWordLE(Binary); +} + +void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { + DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI); + + MCE.processDebugLoc(MI.getDebugLoc(), true); + + NumEmitted++; // Keep track of the # of mi's emitted + switch (MI.getDesc().TSFlags & ARMII::FormMask) { + default: { + llvm_unreachable("Unhandled instruction encoding format!"); + break; + } + case ARMII::Pseudo: + emitPseudoInstruction(MI); + break; + case ARMII::DPFrm: + case ARMII::DPSoRegFrm: + emitDataProcessingInstruction(MI); + break; + case ARMII::LdFrm: + case ARMII::StFrm: + emitLoadStoreInstruction(MI); + break; + case ARMII::LdMiscFrm: + case ARMII::StMiscFrm: + emitMiscLoadStoreInstruction(MI); + break; + case ARMII::LdStMulFrm: + emitLoadStoreMultipleInstruction(MI); + break; + case ARMII::MulFrm: + emitMulFrmInstruction(MI); + break; + case ARMII::ExtFrm: + emitExtendInstruction(MI); + break; + case ARMII::ArithMiscFrm: + emitMiscArithInstruction(MI); + break; + case ARMII::BrFrm: + emitBranchInstruction(MI); + break; + case ARMII::BrMiscFrm: + emitMiscBranchInstruction(MI); + break; + // VFP instructions. + case ARMII::VFPUnaryFrm: + case ARMII::VFPBinaryFrm: + emitVFPArithInstruction(MI); + break; + case ARMII::VFPConv1Frm: + case ARMII::VFPConv2Frm: + case ARMII::VFPConv3Frm: + case ARMII::VFPConv4Frm: + case ARMII::VFPConv5Frm: + emitVFPConversionInstruction(MI); + break; + case ARMII::VFPLdStFrm: + emitVFPLoadStoreInstruction(MI); + break; + case ARMII::VFPLdStMulFrm: + emitVFPLoadStoreMultipleInstruction(MI); + break; + case ARMII::VFPMiscFrm: + emitMiscInstruction(MI); + break; + } + MCE.processDebugLoc(MI.getDebugLoc(), false); +} + +void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) { + unsigned CPI = MI.getOperand(0).getImm(); // CP instruction index. + unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index. + const MachineConstantPoolEntry &MCPE = (*MCPEs)[CPIndex]; + + // Remember the CONSTPOOL_ENTRY address for later relocation. + JTI->addConstantPoolEntryAddr(CPI, MCE.getCurrentPCValue()); + + // Emit constpool island entry. In most cases, the actual values will be + // resolved and relocated after code emission. 
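+  // For example, a GlobalValue entry is emitted below as a placeholder word of
+  // 0 and patched with the symbol's real address once the relocation is applied.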
+ if (MCPE.isMachineConstantPoolEntry()) { + ARMConstantPoolValue *ACPV = + static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal); + + DEBUG(errs() << " ** ARM constant pool #" << CPI << " @ " + << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n'); + + assert(ACPV->isGlobalValue() && "unsupported constant pool value"); + const GlobalValue *GV = ACPV->getGV(); + if (GV) { + Reloc::Model RelocM = TM.getRelocationModel(); + emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry, + isa<Function>(GV), + Subtarget->GVIsIndirectSymbol(GV, RelocM), + (intptr_t)ACPV); + } else { + emitExternalSymbolAddress(ACPV->getSymbol(), ARM::reloc_arm_absolute); + } + emitWordLE(0); + } else { + const Constant *CV = MCPE.Val.ConstVal; + + DEBUG({ + errs() << " ** Constant pool #" << CPI << " @ " + << (void*)MCE.getCurrentPCValue() << " "; + if (const Function *F = dyn_cast<Function>(CV)) + errs() << F->getName(); + else + errs() << *CV; + errs() << '\n'; + }); + + if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) { + emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false); + emitWordLE(0); + } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) { + uint32_t Val = *(uint32_t*)CI->getValue().getRawData(); + emitWordLE(Val); + } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) { + if (CFP->getType()->isFloatTy()) + emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); + else if (CFP->getType()->isDoubleTy()) + emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); + else { + llvm_unreachable("Unable to handle this constantpool entry!"); + } + } else { + llvm_unreachable("Unable to handle this constantpool entry!"); + } + } +} + +void ARMCodeEmitter::emitMOVi32immInstruction(const MachineInstr &MI) { + const MachineOperand &MO0 = MI.getOperand(0); + const MachineOperand &MO1 = MI.getOperand(1); + + // Emit the 'movw' instruction. + unsigned Binary = 0x30 << 20; // mov: Insts{27-20} = 0b00110000 + + unsigned Lo16 = getMovi32Value(MI, MO1, ARM::reloc_arm_movw) & 0xFFFF; + + // Set the conditional execution predicate. + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode Rd. + Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift; + + // Encode imm16 as imm4:imm12 + Binary |= Lo16 & 0xFFF; // Insts{11-0} = imm12 + Binary |= ((Lo16 >> 12) & 0xF) << 16; // Insts{19-16} = imm4 + emitWordLE(Binary); + + unsigned Hi16 = getMovi32Value(MI, MO1, ARM::reloc_arm_movt) >> 16; + // Emit the 'movt' instruction. + Binary = 0x34 << 20; // movt: Insts{27-20} = 0b00110100 + + // Set the conditional execution predicate. + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode Rd. + Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift; + + // Encode imm16 as imm4:imm1, same as movw above. + Binary |= Hi16 & 0xFFF; + Binary |= ((Hi16 >> 12) & 0xF) << 16; + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitMOVi2piecesInstruction(const MachineInstr &MI) { + const MachineOperand &MO0 = MI.getOperand(0); + const MachineOperand &MO1 = MI.getOperand(1); + assert(MO1.isImm() && ARM_AM::isSOImmTwoPartVal(MO1.getImm()) && + "Not a valid so_imm value!"); + unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO1.getImm()); + unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO1.getImm()); + + // Emit the 'mov' instruction. + unsigned Binary = 0xd << 21; // mov: Insts{24-21} = 0b1101 + + // Set the conditional execution predicate. + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode Rd. 
+ Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift; + + // Encode so_imm. + // Set bit I(25) to identify this is the immediate form of <shifter_op> + Binary |= 1 << ARMII::I_BitShift; + Binary |= getMachineSoImmOpValue(V1); + emitWordLE(Binary); + + // Now the 'orr' instruction. + Binary = 0xc << 21; // orr: Insts{24-21} = 0b1100 + + // Set the conditional execution predicate. + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode Rd. + Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift; + + // Encode Rn. + Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRnShift; + + // Encode so_imm. + // Set bit I(25) to identify this is the immediate form of <shifter_op> + Binary |= 1 << ARMII::I_BitShift; + Binary |= getMachineSoImmOpValue(V2); + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) { + // It's basically add r, pc, (LJTI - $+8) + + const TargetInstrDesc &TID = MI.getDesc(); + + // Emit the 'add' instruction. + unsigned Binary = 0x4 << 21; // add: Insts{24-31} = 0b0100 + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode S bit if MI modifies CPSR. + Binary |= getAddrModeSBit(MI, TID); + + // Encode Rd. + Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift; + + // Encode Rn which is PC. + Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift; + + // Encode the displacement. + Binary |= 1 << ARMII::I_BitShift; + emitJumpTableAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_jt_base); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitPseudoMoveInstruction(const MachineInstr &MI) { + unsigned Opcode = MI.getDesc().Opcode; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode S bit if MI modifies CPSR. + if (Opcode == ARM::MOVsrl_flag || Opcode == ARM::MOVsra_flag) + Binary |= 1 << ARMII::S_BitShift; + + // Encode register def if there is one. + Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift; + + // Encode the shift operation. + switch (Opcode) { + default: break; + case ARM::MOVrx: + // rrx + Binary |= 0x6 << 4; + break; + case ARM::MOVsrl_flag: + // lsr #1 + Binary |= (0x2 << 4) | (1 << 7); + break; + case ARM::MOVsra_flag: + // asr #1 + Binary |= (0x4 << 4) | (1 << 7); + break; + } + + // Encode register Rm. + Binary |= getMachineOpValue(MI, 1); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::addPCLabel(unsigned LabelID) { + DEBUG(errs() << " ** LPC" << LabelID << " @ " + << (void*)MCE.getCurrentPCValue() << '\n'); + JTI->addPCLabelAddr(LabelID, MCE.getCurrentPCValue()); +} + +void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) { + unsigned Opcode = MI.getDesc().Opcode; + switch (Opcode) { + default: + llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction"); + case TargetOpcode::INLINEASM: { + // We allow inline assembler nodes with empty bodies - they can + // implicitly define registers, which is ok for JIT. + if (MI.getOperand(0).getSymbolName()[0]) { + report_fatal_error("JIT does not support inline asm!"); + } + break; + } + case TargetOpcode::DBG_LABEL: + case TargetOpcode::EH_LABEL: + MCE.emitLabel(MI.getOperand(0).getMCSymbol()); + break; + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + // Do nothing. 
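+    // (Both are purely book-keeping instructions for the register allocator and
+    // have no machine encoding, so nothing is emitted for them.)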
+    break;
+  case ARM::CONSTPOOL_ENTRY:
+    emitConstPoolInstruction(MI);
+    break;
+  case ARM::PICADD: {
+    // Remember the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // PICADD is just an add instruction that implicitly reads pc.
+    emitDataProcessingInstruction(MI, 0, ARM::PC);
+    break;
+  }
+  case ARM::PICLDR:
+  case ARM::PICLDRB:
+  case ARM::PICSTR:
+  case ARM::PICSTRB: {
+    // Remember the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // These are just load / store instructions that implicitly read pc.
+    emitLoadStoreInstruction(MI, 0, ARM::PC);
+    break;
+  }
+  case ARM::PICLDRH:
+  case ARM::PICLDRSH:
+  case ARM::PICLDRSB:
+  case ARM::PICSTRH: {
+    // Remember the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // These are just load / store instructions that implicitly read pc.
+    emitMiscLoadStoreInstruction(MI, ARM::PC);
+    break;
+  }
+
+  case ARM::MOVi32imm:
+    emitMOVi32immInstruction(MI);
+    break;
+
+  case ARM::MOVi2pieces:
+    // Two instructions to materialize a constant.
+    emitMOVi2piecesInstruction(MI);
+    break;
+  case ARM::LEApcrelJT:
+    // Materialize jumptable address.
+    emitLEApcrelJTInstruction(MI);
+    break;
+  case ARM::MOVrx:
+  case ARM::MOVsrl_flag:
+  case ARM::MOVsra_flag:
+    emitPseudoMoveInstruction(MI);
+    break;
+  }
+}
+
+unsigned ARMCodeEmitter::getMachineSoRegOpValue(const MachineInstr &MI,
+                                                const TargetInstrDesc &TID,
+                                                const MachineOperand &MO,
+                                                unsigned OpIdx) {
+  unsigned Binary = getMachineOpValue(MI, MO);
+
+  const MachineOperand &MO1 = MI.getOperand(OpIdx + 1);
+  const MachineOperand &MO2 = MI.getOperand(OpIdx + 2);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  unsigned Rs = MO1.getReg();
+  if (Rs) {
+    // Set shift operand (bit[7:4]).
+    // LSL - 0001
+    // LSR - 0011
+    // ASR - 0101
+    // ROR - 0111
+    // RRX - 0110 and bit[11:8] clear.
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x1; break;
+    case ARM_AM::lsr: SBits = 0x3; break;
+    case ARM_AM::asr: SBits = 0x5; break;
+    case ARM_AM::ror: SBits = 0x7; break;
+    case ARM_AM::rrx: SBits = 0x6; break;
+    }
+  } else {
+    // Set shift operand (bit[6:4]).
+    // LSL - 000
+    // LSR - 010
+    // ASR - 100
+    // ROR - 110
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x0; break;
+    case ARM_AM::lsr: SBits = 0x2; break;
+    case ARM_AM::asr: SBits = 0x4; break;
+    case ARM_AM::ror: SBits = 0x6; break;
+    }
+  }
+  Binary |= SBits << 4;
+  if (SOpc == ARM_AM::rrx)
+    return Binary;
+
+  // Encode the shift operation Rs or shift_imm (except rrx).
+  if (Rs) {
+    // Encode Rs bit[11:8].
+    assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+    return Binary |
+      (ARMRegisterInfo::getRegisterNumbering(Rs) << ARMII::RegRsShift);
+  }
+
+  // Encode shift_imm bit[11:7].
+  return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7;
+}
+
+unsigned ARMCodeEmitter::getMachineSoImmOpValue(unsigned SoImm) {
+  int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+  assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+  // Encode rotate_imm.
+  unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
+    << ARMII::SoRotImmShift;
+
+  // Encode immed_8.
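// [Editor's note: illustrative example, not part of the imported LLVM sources.]
// A so_imm operand is an 8-bit immediate rotated right by twice the 4-bit
// rotate_imm field. For example, 0xFF000000 is 0xFF rotated right by 8, so it
// is emitted with rotate_imm = 4 (the rotation divided by 2) and immed_8 = 0xFF.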
+ Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal); + return Binary; +} + +unsigned ARMCodeEmitter::getAddrModeSBit(const MachineInstr &MI, + const TargetInstrDesc &TID) const { + for (unsigned i = MI.getNumOperands(), e = TID.getNumOperands(); i != e; --i){ + const MachineOperand &MO = MI.getOperand(i-1); + if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) + return 1 << ARMII::S_BitShift; + } + return 0; +} + +void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI, + unsigned ImplicitRd, + unsigned ImplicitRn) { + const TargetInstrDesc &TID = MI.getDesc(); + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode S bit if MI modifies CPSR. + Binary |= getAddrModeSBit(MI, TID); + + // Encode register def if there is one. + unsigned NumDefs = TID.getNumDefs(); + unsigned OpIdx = 0; + if (NumDefs) + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; + else if (ImplicitRd) + // Special handling for implicit use (e.g. PC). + Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRd) + << ARMII::RegRdShift); + + if (TID.Opcode == ARM::MOVi16) { + // Get immediate from MI. + unsigned Lo16 = getMovi32Value(MI, MI.getOperand(OpIdx), + ARM::reloc_arm_movw); + // Encode imm which is the same as in emitMOVi32immInstruction(). + Binary |= Lo16 & 0xFFF; + Binary |= ((Lo16 >> 12) & 0xF) << 16; + emitWordLE(Binary); + return; + } else if(TID.Opcode == ARM::MOVTi16) { + unsigned Hi16 = (getMovi32Value(MI, MI.getOperand(OpIdx), + ARM::reloc_arm_movt) >> 16); + Binary |= Hi16 & 0xFFF; + Binary |= ((Hi16 >> 12) & 0xF) << 16; + emitWordLE(Binary); + return; + } else if ((TID.Opcode == ARM::BFC) || (TID.Opcode == ARM::BFI)) { + uint32_t v = ~MI.getOperand(2).getImm(); + int32_t lsb = CountTrailingZeros_32(v); + int32_t msb = (32 - CountLeadingZeros_32(v)) - 1; + // Instr{20-16} = msb, Instr{11-7} = lsb + Binary |= (msb & 0x1F) << 16; + Binary |= (lsb & 0x1F) << 7; + emitWordLE(Binary); + return; + } else if ((TID.Opcode == ARM::UBFX) || (TID.Opcode == ARM::SBFX)) { + // Encode Rn in Instr{0-3} + Binary |= getMachineOpValue(MI, OpIdx++); + + uint32_t lsb = MI.getOperand(OpIdx++).getImm(); + uint32_t widthm1 = MI.getOperand(OpIdx++).getImm() - 1; + + // Instr{20-16} = widthm1, Instr{11-7} = lsb + Binary |= (widthm1 & 0x1F) << 16; + Binary |= (lsb & 0x1F) << 7; + emitWordLE(Binary); + return; + } + + // If this is a two-address operand, skip it. e.g. MOVCCr operand 1. + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + + // Encode first non-shifter register operand if there is one. + bool isUnary = TID.TSFlags & ARMII::UnaryDP; + if (!isUnary) { + if (ImplicitRn) + // Special handling for implicit use (e.g. PC). + Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn) + << ARMII::RegRnShift); + else { + Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRnShift; + ++OpIdx; + } + } + + // Encode shifter operand. + const MachineOperand &MO = MI.getOperand(OpIdx); + if ((TID.TSFlags & ARMII::FormMask) == ARMII::DPSoRegFrm) { + // Encode SoReg. + emitWordLE(Binary | getMachineSoRegOpValue(MI, TID, MO, OpIdx)); + return; + } + + if (MO.isReg()) { + // Encode register Rm. + emitWordLE(Binary | ARMRegisterInfo::getRegisterNumbering(MO.getReg())); + return; + } + + // Encode so_imm. 
+ Binary |= getMachineSoImmOpValue((unsigned)MO.getImm()); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI, + unsigned ImplicitRd, + unsigned ImplicitRn) { + const TargetInstrDesc &TID = MI.getDesc(); + unsigned Form = TID.TSFlags & ARMII::FormMask; + bool IsPrePost = (TID.TSFlags & ARMII::IndexModeMask) != 0; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + + // Operand 0 of a pre- and post-indexed store is the address base + // writeback. Skip it. + bool Skipped = false; + if (IsPrePost && Form == ARMII::StFrm) { + ++OpIdx; + Skipped = true; + } + + // Set first operand + if (ImplicitRd) + // Special handling for implicit use (e.g. PC). + Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRd) + << ARMII::RegRdShift); + else + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; + + // Set second operand + if (ImplicitRn) + // Special handling for implicit use (e.g. PC). + Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn) + << ARMII::RegRnShift); + else + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; + + // If this is a two-address operand, skip it. e.g. LDR_PRE. + if (!Skipped && TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + + const MachineOperand &MO2 = MI.getOperand(OpIdx); + unsigned AM2Opc = (ImplicitRn == ARM::PC) + ? 0 : MI.getOperand(OpIdx+1).getImm(); + + // Set bit U(23) according to sign of immed value (positive or negative). + Binary |= ((ARM_AM::getAM2Op(AM2Opc) == ARM_AM::add ? 1 : 0) << + ARMII::U_BitShift); + if (!MO2.getReg()) { // is immediate + if (ARM_AM::getAM2Offset(AM2Opc)) + // Set the value of offset_12 field + Binary |= ARM_AM::getAM2Offset(AM2Opc); + emitWordLE(Binary); + return; + } + + // Set bit I(25), because this is not in immediate enconding. + Binary |= 1 << ARMII::I_BitShift; + assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg())); + // Set bit[3:0] to the corresponding Rm register + Binary |= ARMRegisterInfo::getRegisterNumbering(MO2.getReg()); + + // If this instr is in scaled register offset/index instruction, set + // shift_immed(bit[11:7]) and shift(bit[6:5]) fields. + if (unsigned ShImm = ARM_AM::getAM2Offset(AM2Opc)) { + Binary |= getShiftOp(AM2Opc) << ARMII::ShiftImmShift; // shift + Binary |= ShImm << ARMII::ShiftShift; // shift_immed + } + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI, + unsigned ImplicitRn) { + const TargetInstrDesc &TID = MI.getDesc(); + unsigned Form = TID.TSFlags & ARMII::FormMask; + bool IsPrePost = (TID.TSFlags & ARMII::IndexModeMask) != 0; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + + // Operand 0 of a pre- and post-indexed store is the address base + // writeback. Skip it. + bool Skipped = false; + if (IsPrePost && Form == ARMII::StMiscFrm) { + ++OpIdx; + Skipped = true; + } + + // Set first operand + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; + + // Skip LDRD and STRD's second operand. + if (TID.Opcode == ARM::LDRD || TID.Opcode == ARM::STRD) + ++OpIdx; + + // Set second operand + if (ImplicitRn) + // Special handling for implicit use (e.g. PC). 
+ Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn) + << ARMII::RegRnShift); + else + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; + + // If this is a two-address operand, skip it. e.g. LDRH_POST. + if (!Skipped && TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + + const MachineOperand &MO2 = MI.getOperand(OpIdx); + unsigned AM3Opc = (ImplicitRn == ARM::PC) + ? 0 : MI.getOperand(OpIdx+1).getImm(); + + // Set bit U(23) according to sign of immed value (positive or negative) + Binary |= ((ARM_AM::getAM3Op(AM3Opc) == ARM_AM::add ? 1 : 0) << + ARMII::U_BitShift); + + // If this instr is in register offset/index encoding, set bit[3:0] + // to the corresponding Rm register. + if (MO2.getReg()) { + Binary |= ARMRegisterInfo::getRegisterNumbering(MO2.getReg()); + emitWordLE(Binary); + return; + } + + // This instr is in immediate offset/index encoding, set bit 22 to 1. + Binary |= 1 << ARMII::AM3_I_BitShift; + if (unsigned ImmOffs = ARM_AM::getAM3Offset(AM3Opc)) { + // Set operands + Binary |= (ImmOffs >> 4) << ARMII::ImmHiShift; // immedH + Binary |= (ImmOffs & 0xF); // immedL + } + + emitWordLE(Binary); +} + +static unsigned getAddrModeUPBits(unsigned Mode) { + unsigned Binary = 0; + + // Set addressing mode by modifying bits U(23) and P(24) + // IA - Increment after - bit U = 1 and bit P = 0 + // IB - Increment before - bit U = 1 and bit P = 1 + // DA - Decrement after - bit U = 0 and bit P = 0 + // DB - Decrement before - bit U = 0 and bit P = 1 + switch (Mode) { + default: llvm_unreachable("Unknown addressing sub-mode!"); + case ARM_AM::da: break; + case ARM_AM::db: Binary |= 0x1 << ARMII::P_BitShift; break; + case ARM_AM::ia: Binary |= 0x1 << ARMII::U_BitShift; break; + case ARM_AM::ib: Binary |= 0x3 << ARMII::U_BitShift; break; + } + + return Binary; +} + +void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + bool IsUpdating = (TID.TSFlags & ARMII::IndexModeMask) != 0; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Skip operand 0 of an instruction with base register update. + unsigned OpIdx = 0; + if (IsUpdating) + ++OpIdx; + + // Set base address operand + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; + + // Set addressing mode by modifying bits U(23) and P(24) + const MachineOperand &MO = MI.getOperand(OpIdx++); + Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(MO.getImm())); + + // Set bit W(21) + if (IsUpdating) + Binary |= 0x1 << ARMII::W_BitShift; + + // Set registers + for (unsigned i = OpIdx+2, e = MI.getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || MO.isImplicit()) + break; + unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(MO.getReg()); + assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && + RegNum < 16); + Binary |= 0x1 << RegNum; + } + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitMulFrmInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode S bit if MI modifies CPSR. + Binary |= getAddrModeSBit(MI, TID); + + // 32x32->64bit operations have two destination registers. 
The number + // of register definitions will tell us if that's what we're dealing with. + unsigned OpIdx = 0; + if (TID.getNumDefs() == 2) + Binary |= getMachineOpValue (MI, OpIdx++) << ARMII::RegRdLoShift; + + // Encode Rd + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdHiShift; + + // Encode Rm + Binary |= getMachineOpValue(MI, OpIdx++); + + // Encode Rs + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRsShift; + + // Many multiple instructions (e.g. MLA) have three src operands. Encode + // it as Rn (for multiply, that's in the same offset as RdLo. + if (TID.getNumOperands() > OpIdx && + !TID.OpInfo[OpIdx].isPredicate() && + !TID.OpInfo[OpIdx].isOptionalDef()) + Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRdLoShift; + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitExtendInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + + // Encode Rd + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; + + const MachineOperand &MO1 = MI.getOperand(OpIdx++); + const MachineOperand &MO2 = MI.getOperand(OpIdx); + if (MO2.isReg()) { + // Two register operand form. + // Encode Rn. + Binary |= getMachineOpValue(MI, MO1) << ARMII::RegRnShift; + + // Encode Rm. + Binary |= getMachineOpValue(MI, MO2); + ++OpIdx; + } else { + Binary |= getMachineOpValue(MI, MO1); + } + + // Encode rot imm (0, 8, 16, or 24) if it has a rotate immediate operand. + if (MI.getOperand(OpIdx).isImm() && + !TID.OpInfo[OpIdx].isPredicate() && + !TID.OpInfo[OpIdx].isOptionalDef()) + Binary |= (getMachineOpValue(MI, OpIdx) / 8) << ARMII::ExtRotImmShift; + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitMiscArithInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + + // Encode Rd + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; + + const MachineOperand &MO = MI.getOperand(OpIdx++); + if (OpIdx == TID.getNumOperands() || + TID.OpInfo[OpIdx].isPredicate() || + TID.OpInfo[OpIdx].isOptionalDef()) { + // Encode Rm and it's done. + Binary |= getMachineOpValue(MI, MO); + emitWordLE(Binary); + return; + } + + // Encode Rn. + Binary |= getMachineOpValue(MI, MO) << ARMII::RegRnShift; + + // Encode Rm. + Binary |= getMachineOpValue(MI, OpIdx++); + + // Encode shift_imm. + unsigned ShiftAmt = MI.getOperand(OpIdx).getImm(); + assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!"); + Binary |= ShiftAmt << ARMII::ShiftShift; + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitBranchInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + + if (TID.Opcode == ARM::TPsoft) { + llvm_unreachable("ARM::TPsoft FIXME"); // FIXME + } + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Set signed_immed_24 field + Binary |= getMachineOpValue(MI, 0); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitInlineJumpTable(unsigned JTIndex) { + // Remember the base address of the inline jump table. 
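// [Editor's note: illustrative sketch, not part of the imported LLVM sources.]
// The inline jump table emitted below is simply one 32-bit word per successor
// block, laid out right after the dispatching instruction, roughly:
//     <dispatch into pc>            @ emitted by emitMiscBranchInstruction
//     .word LBB1 - LJTI             @ PIC entry: destination minus table base
//     .word LBB2 - LJTI
//     ...
// Each word is emitted as a zero placeholder plus a reloc_arm_pic_jt (or
// reloc_arm_absolute) relocation that is resolved later.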
+ uintptr_t JTBase = MCE.getCurrentPCValue(); + JTI->addJumpTableBaseAddr(JTIndex, JTBase); + DEBUG(errs() << " ** Jump Table #" << JTIndex << " @ " << (void*)JTBase + << '\n'); + + // Now emit the jump table entries. + const std::vector<MachineBasicBlock*> &MBBs = (*MJTEs)[JTIndex].MBBs; + for (unsigned i = 0, e = MBBs.size(); i != e; ++i) { + if (IsPIC) + // DestBB address - JT base. + emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_pic_jt, JTBase); + else + // Absolute DestBB address. + emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_absolute); + emitWordLE(0); + } +} + +void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + + // Handle jump tables. + if (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::BR_JTadd) { + // First emit a ldr pc, [] instruction. + emitDataProcessingInstruction(MI, ARM::PC); + + // Then emit the inline jump table. + unsigned JTIndex = + (TID.Opcode == ARM::BR_JTr) + ? MI.getOperand(1).getIndex() : MI.getOperand(2).getIndex(); + emitInlineJumpTable(JTIndex); + return; + } else if (TID.Opcode == ARM::BR_JTm) { + // First emit a ldr pc, [] instruction. + emitLoadStoreInstruction(MI, ARM::PC); + + // Then emit the inline jump table. + emitInlineJumpTable(MI.getOperand(3).getIndex()); + return; + } + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + if (TID.Opcode == ARM::BX_RET || TID.Opcode == ARM::MOVPCLR) + // The return register is LR. + Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::LR); + else + // otherwise, set the return register + Binary |= getMachineOpValue(MI, 0); + + emitWordLE(Binary); +} + +static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegD = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + bool isSPVFP = false; + RegD = ARMRegisterInfo::getRegisterNumbering(RegD, &isSPVFP); + if (!isSPVFP) + Binary |= RegD << ARMII::RegRdShift; + else { + Binary |= ((RegD & 0x1E) >> 1) << ARMII::RegRdShift; + Binary |= (RegD & 0x01) << ARMII::D_BitShift; + } + return Binary; +} + +static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegN = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + bool isSPVFP = false; + RegN = ARMRegisterInfo::getRegisterNumbering(RegN, &isSPVFP); + if (!isSPVFP) + Binary |= RegN << ARMII::RegRnShift; + else { + Binary |= ((RegN & 0x1E) >> 1) << ARMII::RegRnShift; + Binary |= (RegN & 0x01) << ARMII::N_BitShift; + } + return Binary; +} + +static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegM = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + bool isSPVFP = false; + RegM = ARMRegisterInfo::getRegisterNumbering(RegM, &isSPVFP); + if (!isSPVFP) + Binary |= RegM; + else { + Binary |= ((RegM & 0x1E) >> 1); + Binary |= (RegM & 0x01) << ARMII::M_BitShift; + } + return Binary; +} + +void ARMCodeEmitter::emitVFPArithInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + assert((Binary & ARMII::D_BitShift) == 0 && + (Binary & ARMII::N_BitShift) == 0 && + (Binary & ARMII::M_BitShift) == 0 && "VFP encoding bug!"); + + // Encode Dd / Sd. 
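// [Editor's note: illustrative example, not part of the imported LLVM sources.]
// encodeVFPRd/Rn/Rm above split a single-precision register number across the
// 4-bit register field and the extra D/N/M bit. For example, S5 (0b00101) is
// emitted with the Rd field = 0b0010 and D = 1, whereas a double register such
// as D5 goes entirely into the Rd field with D = 0.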
+ Binary |= encodeVFPRd(MI, OpIdx++); + + // If this is a two-address operand, skip it, e.g. FMACD. + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + + // Encode Dn / Sn. + if ((TID.TSFlags & ARMII::FormMask) == ARMII::VFPBinaryFrm) + Binary |= encodeVFPRn(MI, OpIdx++); + + if (OpIdx == TID.getNumOperands() || + TID.OpInfo[OpIdx].isPredicate() || + TID.OpInfo[OpIdx].isOptionalDef()) { + // FCMPEZD etc. has only one operand. + emitWordLE(Binary); + return; + } + + // Encode Dm / Sm. + Binary |= encodeVFPRm(MI, OpIdx); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitVFPConversionInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + unsigned Form = TID.TSFlags & ARMII::FormMask; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + switch (Form) { + default: break; + case ARMII::VFPConv1Frm: + case ARMII::VFPConv2Frm: + case ARMII::VFPConv3Frm: + // Encode Dd / Sd. + Binary |= encodeVFPRd(MI, 0); + break; + case ARMII::VFPConv4Frm: + // Encode Dn / Sn. + Binary |= encodeVFPRn(MI, 0); + break; + case ARMII::VFPConv5Frm: + // Encode Dm / Sm. + Binary |= encodeVFPRm(MI, 0); + break; + } + + switch (Form) { + default: break; + case ARMII::VFPConv1Frm: + // Encode Dm / Sm. + Binary |= encodeVFPRm(MI, 1); + break; + case ARMII::VFPConv2Frm: + case ARMII::VFPConv3Frm: + // Encode Dn / Sn. + Binary |= encodeVFPRn(MI, 1); + break; + case ARMII::VFPConv4Frm: + case ARMII::VFPConv5Frm: + // Encode Dd / Sd. + Binary |= encodeVFPRd(MI, 1); + break; + } + + if (Form == ARMII::VFPConv5Frm) + // Encode Dn / Sn. + Binary |= encodeVFPRn(MI, 2); + else if (Form == ARMII::VFPConv3Frm) + // Encode Dm / Sm. + Binary |= encodeVFPRm(MI, 2); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitVFPLoadStoreInstruction(const MachineInstr &MI) { + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + + // Encode Dd / Sd. + Binary |= encodeVFPRd(MI, OpIdx++); + + // Encode address base. + const MachineOperand &Base = MI.getOperand(OpIdx++); + Binary |= getMachineOpValue(MI, Base) << ARMII::RegRnShift; + + // If there is a non-zero immediate offset, encode it. + if (Base.isReg()) { + const MachineOperand &Offset = MI.getOperand(OpIdx); + if (unsigned ImmOffs = ARM_AM::getAM5Offset(Offset.getImm())) { + if (ARM_AM::getAM5Op(Offset.getImm()) == ARM_AM::add) + Binary |= 1 << ARMII::U_BitShift; + Binary |= ImmOffs; + emitWordLE(Binary); + return; + } + } + + // If immediate offset is omitted, default to +0. + Binary |= 1 << ARMII::U_BitShift; + + emitWordLE(Binary); +} + +void +ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + bool IsUpdating = (TID.TSFlags & ARMII::IndexModeMask) != 0; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Skip operand 0 of an instruction with base register update. 
+ unsigned OpIdx = 0; + if (IsUpdating) + ++OpIdx; + + // Set base address operand + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; + + // Set addressing mode by modifying bits U(23) and P(24) + const MachineOperand &MO = MI.getOperand(OpIdx++); + Binary |= getAddrModeUPBits(ARM_AM::getAM5SubMode(MO.getImm())); + + // Set bit W(21) + if (IsUpdating) + Binary |= 0x1 << ARMII::W_BitShift; + + // First register is encoded in Dd. + Binary |= encodeVFPRd(MI, OpIdx+2); + + // Number of registers are encoded in offset field. + unsigned NumRegs = 1; + for (unsigned i = OpIdx+3, e = MI.getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || MO.isImplicit()) + break; + ++NumRegs; + } + // Bit 8 will be set if <list> is consecutive 64-bit registers (e.g., D0) + // Otherwise, it will be 0, in the case of 32-bit registers. + if(Binary & 0x100) + Binary |= NumRegs * 2; + else + Binary |= NumRegs; + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitMiscInstruction(const MachineInstr &MI) { + unsigned Opcode = MI.getDesc().Opcode; + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + switch(Opcode) { + default: + llvm_unreachable("ARMCodeEmitter::emitMiscInstruction"); + + case ARM::FMSTAT: + // No further encoding needed. + break; + + case ARM::VMRS: + case ARM::VMSR: { + const MachineOperand &MO0 = MI.getOperand(0); + // Encode Rt. + Binary |= ARMRegisterInfo::getRegisterNumbering(MO0.getReg()) + << ARMII::RegRdShift; + break; + } + + case ARM::FCONSTD: + case ARM::FCONSTS: { + // Encode Dd / Sd. + Binary |= encodeVFPRd(MI, 0); + + // Encode imm., Table A7-18 VFP modified immediate constants + const MachineOperand &MO1 = MI.getOperand(1); + unsigned Imm = static_cast<unsigned>(MO1.getFPImm()->getValueAPF() + .bitcastToAPInt().getHiBits(32).getLimitedValue()); + unsigned ModifiedImm; + + if(Opcode == ARM::FCONSTS) + ModifiedImm = (Imm & 0x80000000) >> 24 | // a + (Imm & 0x03F80000) >> 19; // bcdefgh + else // Opcode == ARM::FCONSTD + ModifiedImm = (Imm & 0x80000000) >> 24 | // a + (Imm & 0x007F0000) >> 16; // bcdefgh + + // Insts{19-16} = abcd, Insts{3-0} = efgh + Binary |= ((ModifiedImm & 0xF0) >> 4) << 16; + Binary |= (ModifiedImm & 0xF); + break; + } + } + + emitWordLE(Binary); +} + +#include "ARMGenCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp new file mode 100644 index 0000000..13d8b74 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -0,0 +1,1821 @@ +//===-- ARMConstantIslandPass.cpp - ARM constant islands --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that splits the constant pool up into 'islands' +// which are scattered through-out the function. This is required due to the +// limited pc-relative displacements that ARM has. 
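// [Editor's note: added illustration, not part of the imported LLVM sources.]
// For example, an ARM-mode pc-relative load (ldr rd, [pc, #+/-imm12]) can only
// reach about 4 KB in either direction, and a Thumb1 literal load (an 8-bit
// word offset) only about 1 KB forward, so constants must be placed close to
// the instructions that reference them.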
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-cp-islands" +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMInstrInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include <algorithm> +using namespace llvm; + +STATISTIC(NumCPEs, "Number of constpool entries"); +STATISTIC(NumSplit, "Number of uncond branches inserted"); +STATISTIC(NumCBrFixed, "Number of cond branches fixed"); +STATISTIC(NumUBrFixed, "Number of uncond branches fixed"); +STATISTIC(NumTBs, "Number of table branches generated"); +STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk"); +STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk"); +STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed"); +STATISTIC(NumJTMoved, "Number of jump table destination blocks moved"); +STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted"); + + +static cl::opt<bool> +AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true), + cl::desc("Adjust basic block layout to better use TB[BH]")); + +namespace { + /// ARMConstantIslands - Due to limited PC-relative displacements, ARM + /// requires constant pool entries to be scattered among the instructions + /// inside a function. To do this, it completely ignores the normal LLVM + /// constant pool; instead, it places constants wherever it feels like with + /// special instructions. + /// + /// The terminology used in this pass includes: + /// Islands - Clumps of constants placed in the function. + /// Water - Potential places where an island could be formed. + /// CPE - A constant pool entry that has been placed somewhere, which + /// tracks a list of users. + class ARMConstantIslands : public MachineFunctionPass { + /// BBSizes - The size of each MachineBasicBlock in bytes of code, indexed + /// by MBB Number. The two-byte pads required for Thumb alignment are + /// counted as part of the following block (i.e., the offset and size for + /// a padded block will both be ==2 mod 4). + std::vector<unsigned> BBSizes; + + /// BBOffsets - the offset of each MBB in bytes, starting from 0. + /// The two-byte pads required for Thumb alignment are counted as part of + /// the following block. + std::vector<unsigned> BBOffsets; + + /// WaterList - A sorted list of basic blocks where islands could be placed + /// (i.e. blocks that don't fall through to the following block, due + /// to a return, unreachable, or unconditional branch). + std::vector<MachineBasicBlock*> WaterList; + + /// NewWaterList - The subset of WaterList that was created since the + /// previous iteration by inserting unconditional branches. + SmallSet<MachineBasicBlock*, 4> NewWaterList; + + typedef std::vector<MachineBasicBlock*>::iterator water_iterator; + + /// CPUser - One user of a constant pool, keeping the machine instruction + /// pointer, the constant pool being referenced, and the max displacement + /// allowed from the instruction to the CP. 
The HighWaterMark records the + /// highest basic block where a new CPEntry can be placed. To ensure this + /// pass terminates, the CP entries are initially placed at the end of the + /// function and then move monotonically to lower addresses. The + /// exception to this rule is when the current CP entry for a particular + /// CPUser is out of range, but there is another CP entry for the same + /// constant value in range. We want to use the existing in-range CP + /// entry, but if it later moves out of range, the search for new water + /// should resume where it left off. The HighWaterMark is used to record + /// that point. + struct CPUser { + MachineInstr *MI; + MachineInstr *CPEMI; + MachineBasicBlock *HighWaterMark; + unsigned MaxDisp; + bool NegOk; + bool IsSoImm; + CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp, + bool neg, bool soimm) + : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm) { + HighWaterMark = CPEMI->getParent(); + } + }; + + /// CPUsers - Keep track of all of the machine instructions that use various + /// constant pools and their max displacement. + std::vector<CPUser> CPUsers; + + /// CPEntry - One per constant pool entry, keeping the machine instruction + /// pointer, the constpool index, and the number of CPUser's which + /// reference this entry. + struct CPEntry { + MachineInstr *CPEMI; + unsigned CPI; + unsigned RefCount; + CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0) + : CPEMI(cpemi), CPI(cpi), RefCount(rc) {} + }; + + /// CPEntries - Keep track of all of the constant pool entry machine + /// instructions. For each original constpool index (i.e. those that + /// existed upon entry to this pass), it keeps a vector of entries. + /// Original elements are cloned as we go along; the clones are + /// put in the vector of the original element, but have distinct CPIs. + std::vector<std::vector<CPEntry> > CPEntries; + + /// ImmBranch - One per immediate branch, keeping the machine instruction + /// pointer, conditional or unconditional, the max displacement, + /// and (if isCond is true) the corresponding unconditional branch + /// opcode. + struct ImmBranch { + MachineInstr *MI; + unsigned MaxDisp : 31; + bool isCond : 1; + int UncondBr; + ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr) + : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {} + }; + + /// ImmBranches - Keep track of all the immediate branch instructions. + /// + std::vector<ImmBranch> ImmBranches; + + /// PushPopMIs - Keep track of all the Thumb push / pop instructions. + /// + SmallVector<MachineInstr*, 4> PushPopMIs; + + /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions. + SmallVector<MachineInstr*, 4> T2JumpTables; + + /// HasFarJump - True if any far jump instruction has been emitted during + /// the branch fix up pass. + bool HasFarJump; + + /// HasInlineAsm - True if the function contains inline assembly. 
+ bool HasInlineAsm; + + const TargetInstrInfo *TII; + const ARMSubtarget *STI; + ARMFunctionInfo *AFI; + bool isThumb; + bool isThumb1; + bool isThumb2; + public: + static char ID; + ARMConstantIslands() : MachineFunctionPass(&ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "ARM constant island placement and branch shortening pass"; + } + + private: + void DoInitialPlacement(MachineFunction &MF, + std::vector<MachineInstr*> &CPEMIs); + CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); + void JumpTableFunctionScan(MachineFunction &MF); + void InitialFunctionScan(MachineFunction &MF, + const std::vector<MachineInstr*> &CPEMIs); + MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI); + void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB); + void AdjustBBOffsetsAfter(MachineBasicBlock *BB, int delta); + bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI); + int LookForExistingCPEntry(CPUser& U, unsigned UserOffset); + bool LookForWater(CPUser&U, unsigned UserOffset, water_iterator &WaterIter); + void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset, + MachineBasicBlock *&NewMBB); + bool HandleConstantPoolUser(MachineFunction &MF, unsigned CPUserIndex); + void RemoveDeadCPEMI(MachineInstr *CPEMI); + bool RemoveUnusedCPEntries(); + bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset, + MachineInstr *CPEMI, unsigned Disp, bool NegOk, + bool DoDump = false); + bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water, + CPUser &U); + bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset, + unsigned Disp, bool NegativeOK, bool IsSoImm = false); + bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); + bool FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br); + bool FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br); + bool FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br); + bool UndoLRSpillRestore(); + bool OptimizeThumb2Instructions(MachineFunction &MF); + bool OptimizeThumb2Branches(MachineFunction &MF); + bool ReorderThumb2JumpTables(MachineFunction &MF); + bool OptimizeThumb2JumpTables(MachineFunction &MF); + MachineBasicBlock *AdjustJTTargetBlockForward(MachineBasicBlock *BB, + MachineBasicBlock *JTBB); + + unsigned GetOffsetOf(MachineInstr *MI) const; + void dumpBBs(); + void verify(MachineFunction &MF); + }; + char ARMConstantIslands::ID = 0; +} + +/// verify - check BBOffsets, BBSizes, alignment of islands +void ARMConstantIslands::verify(MachineFunction &MF) { + assert(BBOffsets.size() == BBSizes.size()); + for (unsigned i = 1, e = BBOffsets.size(); i != e; ++i) + assert(BBOffsets[i-1]+BBSizes[i-1] == BBOffsets[i]); + if (!isThumb) + return; +#ifndef NDEBUG + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + MachineBasicBlock *MBB = MBBI; + if (!MBB->empty() && + MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) { + unsigned MBBId = MBB->getNumber(); + assert(HasInlineAsm || + (BBOffsets[MBBId]%4 == 0 && BBSizes[MBBId]%4 == 0) || + (BBOffsets[MBBId]%4 != 0 && BBSizes[MBBId]%4 != 0)); + } + } + for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { + CPUser &U = CPUsers[i]; + unsigned UserOffset = GetOffsetOf(U.MI) + (isThumb ? 4 : 8); + unsigned CPEOffset = GetOffsetOf(U.CPEMI); + unsigned Disp = UserOffset < CPEOffset ? 
CPEOffset - UserOffset :
+                                      UserOffset - CPEOffset;
+    assert(Disp <= U.MaxDisp && "Constant pool entry out of range!");
+  }
+#endif
+}
+
+/// print block size and offset information - debugging
+void ARMConstantIslands::dumpBBs() {
+  for (unsigned J = 0, E = BBOffsets.size(); J !=E; ++J) {
+    DEBUG(errs() << "block " << J << " offset " << BBOffsets[J]
+                 << " size " << BBSizes[J] << "\n");
+  }
+}
+
+/// createARMConstantIslandPass - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+  return new ARMConstantIslands();
+}
+
+bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) {
+  MachineConstantPool &MCP = *MF.getConstantPool();
+
+  TII = MF.getTarget().getInstrInfo();
+  AFI = MF.getInfo<ARMFunctionInfo>();
+  STI = &MF.getTarget().getSubtarget<ARMSubtarget>();
+
+  isThumb = AFI->isThumbFunction();
+  isThumb1 = AFI->isThumb1OnlyFunction();
+  isThumb2 = AFI->isThumb2Function();
+
+  HasFarJump = false;
+  HasInlineAsm = false;
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF.RenumberBlocks();
+
+  // Try to reorder and otherwise adjust the block layout to make good use
+  // of the TB[BH] instructions.
+  bool MadeChange = false;
+  if (isThumb2 && AdjustJumpTableBlocks) {
+    JumpTableFunctionScan(MF);
+    MadeChange |= ReorderThumb2JumpTables(MF);
+    // Data is out of date, so clear it. It'll be re-computed later.
+    T2JumpTables.clear();
+    // Blocks may have shifted around. Keep the numbering up to date.
+    MF.RenumberBlocks();
+  }
+
+  // Thumb1 functions containing constant pools get 4-byte alignment.
+  // This is so we can keep exact track of where the alignment padding goes.
+
+  // ARM and Thumb2 functions need to be 4-byte aligned.
+  if (!isThumb1)
+    MF.EnsureAlignment(2);  // 2 = log2(4)
+
+  // Perform the initial placement of the constant pool entries. To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP.isEmpty()) {
+    DoInitialPlacement(MF, CPEMIs);
+    if (isThumb1)
+      MF.EnsureAlignment(2);  // 2 = log2(4)
+  }
+
+  /// The next UID to take is the first unused one.
+  AFI->initConstPoolEntryUId(CPEMIs.size());
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  InitialFunctionScan(MF, CPEMIs);
+  CPEMIs.clear();
+
+  /// Remove dead constant pool entries.
+  RemoveUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  unsigned NoCPIters = 0, NoBRIters = 0;
+  while (true) {
+    bool CPChange = false;
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      CPChange |= HandleConstantPoolUser(MF, i);
+    if (CPChange && ++NoCPIters > 30)
+      llvm_unreachable("Constant Island pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    // Clear NewWaterList now. If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
+
+    bool BRChange = false;
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      BRChange |= FixUpImmediateBr(MF, ImmBranches[i]);
+    if (BRChange && ++NoBRIters > 30)
+      llvm_unreachable("Branch Fix Up pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    if (!CPChange && !BRChange)
+      break;
+    MadeChange = true;
+  }
+
+  // Shrink 32-bit Thumb2 branch, load, and store instructions.
+ if (isThumb2) + MadeChange |= OptimizeThumb2Instructions(MF); + + // After a while, this might be made debug-only, but it is not expensive. + verify(MF); + + // If LR has been forced spilled and no far jumps (i.e. BL) has been issued. + // Undo the spill / restore of LR if possible. + if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) + MadeChange |= UndoLRSpillRestore(); + + BBSizes.clear(); + BBOffsets.clear(); + WaterList.clear(); + CPUsers.clear(); + CPEntries.clear(); + ImmBranches.clear(); + PushPopMIs.clear(); + T2JumpTables.clear(); + + return MadeChange; +} + +/// DoInitialPlacement - Perform the initial placement of the constant pool +/// entries. To start with, we put them all at the end of the function. +void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF, + std::vector<MachineInstr*> &CPEMIs) { + // Create the basic block to hold the CPE's. + MachineBasicBlock *BB = MF.CreateMachineBasicBlock(); + MF.push_back(BB); + + // Add all of the constants from the constant pool to the end block, use an + // identity mapping of CPI's to CPE's. + const std::vector<MachineConstantPoolEntry> &CPs = + MF.getConstantPool()->getConstants(); + + const TargetData &TD = *MF.getTarget().getTargetData(); + for (unsigned i = 0, e = CPs.size(); i != e; ++i) { + unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); + // Verify that all constant pool entries are a multiple of 4 bytes. If not, + // we would have to pad them out or something so that instructions stay + // aligned. + assert((Size & 3) == 0 && "CP Entry not multiple of 4 bytes!"); + MachineInstr *CPEMI = + BuildMI(BB, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) + .addImm(i).addConstantPoolIndex(i).addImm(Size); + CPEMIs.push_back(CPEMI); + + // Add a new CPEntry, but no corresponding CPUser yet. + std::vector<CPEntry> CPEs; + CPEs.push_back(CPEntry(CPEMI, i)); + CPEntries.push_back(CPEs); + NumCPEs++; + DEBUG(errs() << "Moved CPI#" << i << " to end of function as #" << i + << "\n"); + } +} + +/// BBHasFallthrough - Return true if the specified basic block can fallthrough +/// into the block immediately after it. +static bool BBHasFallthrough(MachineBasicBlock *MBB) { + // Get the next machine basic block in the function. + MachineFunction::iterator MBBI = MBB; + if (llvm::next(MBBI) == MBB->getParent()->end()) // Can't fall off end of function. + return false; + + MachineBasicBlock *NextBB = llvm::next(MBBI); + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) + if (*I == NextBB) + return true; + + return false; +} + +/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI, +/// look up the corresponding CPEntry. +ARMConstantIslands::CPEntry +*ARMConstantIslands::findConstPoolEntry(unsigned CPI, + const MachineInstr *CPEMI) { + std::vector<CPEntry> &CPEs = CPEntries[CPI]; + // Number of entries per constpool index should be small, just do a + // linear search. + for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { + if (CPEs[i].CPEMI == CPEMI) + return &CPEs[i]; + } + return NULL; +} + +/// JumpTableFunctionScan - Do a scan of the function, building up +/// information about the sizes of each block and the locations of all +/// the jump tables. 
+void ARMConstantIslands::JumpTableFunctionScan(MachineFunction &MF) { + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) + if (I->getDesc().isBranch() && I->getOpcode() == ARM::t2BR_JT) + T2JumpTables.push_back(I); + } +} + +/// InitialFunctionScan - Do the initial scan of the function, building up +/// information about the sizes of each block, the location of all the water, +/// and finding all of the constant pool users. +void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, + const std::vector<MachineInstr*> &CPEMIs) { + // First thing, see if the function has any inline assembly in it. If so, + // we have to be conservative about alignment assumptions, as we don't + // know for sure the size of any instructions in the inline assembly. + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) + if (I->getOpcode() == ARM::INLINEASM) + HasInlineAsm = true; + } + + // Now go back through the instructions and build up our data structures + unsigned Offset = 0; + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; + + // If this block doesn't fall through into the next MBB, then this is + // 'water' that a constant pool island could be placed. + if (!BBHasFallthrough(&MBB)) + WaterList.push_back(&MBB); + + unsigned MBBSize = 0; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + // Add instruction size to MBBSize. + MBBSize += TII->GetInstSizeInBytes(I); + + int Opc = I->getOpcode(); + if (I->getDesc().isBranch()) { + bool isCond = false; + unsigned Bits = 0; + unsigned Scale = 1; + int UOpc = Opc; + switch (Opc) { + default: + continue; // Ignore other JT branches + case ARM::tBR_JTr: + // A Thumb1 table jump may involve padding; for the offsets to + // be right, functions containing these must be 4-byte aligned. + MF.EnsureAlignment(2U); + if ((Offset+MBBSize)%4 != 0 || HasInlineAsm) + // FIXME: Add a pseudo ALIGN instruction instead. + MBBSize += 2; // padding + continue; // Does not get an entry in ImmBranches + case ARM::t2BR_JT: + T2JumpTables.push_back(I); + continue; // Does not get an entry in ImmBranches + case ARM::Bcc: + isCond = true; + UOpc = ARM::B; + // Fallthrough + case ARM::B: + Bits = 24; + Scale = 4; + break; + case ARM::tBcc: + isCond = true; + UOpc = ARM::tB; + Bits = 8; + Scale = 2; + break; + case ARM::tB: + Bits = 11; + Scale = 2; + break; + case ARM::t2Bcc: + isCond = true; + UOpc = ARM::t2B; + Bits = 20; + Scale = 2; + break; + case ARM::t2B: + Bits = 24; + Scale = 2; + break; + } + + // Record this immediate branch. + unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; + ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc)); + } + + if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET) + PushPopMIs.push_back(I); + + if (Opc == ARM::CONSTPOOL_ENTRY) + continue; + + // Scan the instructions for constant pool operands. + for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) + if (I->getOperand(op).isCPI()) { + // We found one. The addressing mode tells us the max displacement + // from the PC that this instruction permits. + + // Basic size info comes from the TSFlags field. 
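// [Editor's note: worked example, not part of the imported LLVM sources.]
// The MaxOffs value computed from Bits and Scale below is the largest
// reachable byte displacement for a constant-pool reference, e.g.:
//   LDR / LDRcp / t2LDRpci : Bits = 12, Scale = 1 -> +/-4095 bytes
//   tLDRpci / tLDRcp       : Bits = 8,  Scale = 4 -> 0..1020 bytes (no NegOk)
//   VLDRD / VLDRS          : Bits = 8,  Scale = 4 -> +/-1020 bytes
// Branches above use ((1 << (Bits - 1)) - 1) * Scale instead, because the
// branch displacement is signed.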
+ unsigned Bits = 0; + unsigned Scale = 1; + bool NegOk = false; + bool IsSoImm = false; + + switch (Opc) { + default: + llvm_unreachable("Unknown addressing mode for CP reference!"); + break; + + // Taking the address of a CP entry. + case ARM::LEApcrel: + // This takes a SoImm, which is 8 bit immediate rotated. We'll + // pretend the maximum offset is 255 * 4. Since each instruction + // 4 byte wide, this is always correct. We'll check for other + // displacements that fits in a SoImm as well. + Bits = 8; + Scale = 4; + NegOk = true; + IsSoImm = true; + break; + case ARM::t2LEApcrel: + Bits = 12; + NegOk = true; + break; + case ARM::tLEApcrel: + Bits = 8; + Scale = 4; + break; + + case ARM::LDR: + case ARM::LDRcp: + case ARM::t2LDRpci: + Bits = 12; // +-offset_12 + NegOk = true; + break; + + case ARM::tLDRpci: + case ARM::tLDRcp: + Bits = 8; + Scale = 4; // +(offset_8*4) + break; + + case ARM::VLDRD: + case ARM::VLDRS: + Bits = 8; + Scale = 4; // +-(offset_8*4) + NegOk = true; + break; + } + + // Remember that this is a user of a CP entry. + unsigned CPI = I->getOperand(op).getIndex(); + MachineInstr *CPEMI = CPEMIs[CPI]; + unsigned MaxOffs = ((1 << Bits)-1) * Scale; + CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm)); + + // Increment corresponding CPEntry reference count. + CPEntry *CPE = findConstPoolEntry(CPI, CPEMI); + assert(CPE && "Cannot find a corresponding CPEntry!"); + CPE->RefCount++; + + // Instructions can only use one CP entry, don't bother scanning the + // rest of the operands. + break; + } + } + + // In thumb mode, if this block is a constpool island, we may need padding + // so it's aligned on 4 byte boundary. + if (isThumb && + !MBB.empty() && + MBB.begin()->getOpcode() == ARM::CONSTPOOL_ENTRY && + ((Offset%4) != 0 || HasInlineAsm)) + MBBSize += 2; + + BBSizes.push_back(MBBSize); + BBOffsets.push_back(Offset); + Offset += MBBSize; + } +} + +/// GetOffsetOf - Return the current offset of the specified machine instruction +/// from the start of the function. This offset changes as stuff is moved +/// around inside the function. +unsigned ARMConstantIslands::GetOffsetOf(MachineInstr *MI) const { + MachineBasicBlock *MBB = MI->getParent(); + + // The offset is composed of two things: the sum of the sizes of all MBB's + // before this instruction's block, and the offset from the start of the block + // it is in. + unsigned Offset = BBOffsets[MBB->getNumber()]; + + // If we're looking for a CONSTPOOL_ENTRY in Thumb, see if this block has + // alignment padding, and compensate if so. + if (isThumb && + MI->getOpcode() == ARM::CONSTPOOL_ENTRY && + (Offset%4 != 0 || HasInlineAsm)) + Offset += 2; + + // Sum instructions before MI in MBB. + for (MachineBasicBlock::iterator I = MBB->begin(); ; ++I) { + assert(I != MBB->end() && "Didn't find MI in its own basic block?"); + if (&*I == MI) return Offset; + Offset += TII->GetInstSizeInBytes(I); + } +} + +/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB +/// ID. +static bool CompareMBBNumbers(const MachineBasicBlock *LHS, + const MachineBasicBlock *RHS) { + return LHS->getNumber() < RHS->getNumber(); +} + +/// UpdateForInsertedWaterBlock - When a block is newly inserted into the +/// machine function, it upsets all of the block numbers. Renumber the blocks +/// and update the arrays that parallel this numbering. +void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) { + // Renumber the MBB's to keep them consequtive. 
+ NewBB->getParent()->RenumberBlocks(NewBB); + + // Insert a size into BBSizes to align it properly with the (newly + // renumbered) block numbers. + BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0); + + // Likewise for BBOffsets. + BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0); + + // Next, update WaterList. Specifically, we need to add NewMBB as having + // available water after it. + water_iterator IP = + std::lower_bound(WaterList.begin(), WaterList.end(), NewBB, + CompareMBBNumbers); + WaterList.insert(IP, NewBB); +} + + +/// Split the basic block containing MI into two blocks, which are joined by +/// an unconditional branch. Update data structures and renumber blocks to +/// account for this change and returns the newly created block. +MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { + MachineBasicBlock *OrigBB = MI->getParent(); + MachineFunction &MF = *OrigBB->getParent(); + + // Create a new MBB for the code after the OrigBB. + MachineBasicBlock *NewBB = + MF.CreateMachineBasicBlock(OrigBB->getBasicBlock()); + MachineFunction::iterator MBBI = OrigBB; ++MBBI; + MF.insert(MBBI, NewBB); + + // Splice the instructions starting with MI over to NewBB. + NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); + + // Add an unconditional branch from OrigBB to NewBB. + // Note the new unconditional branch is not being recorded. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond to anything in the source. + unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B; + BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB); + NumSplit++; + + // Update the CFG. All succs of OrigBB are now succs of NewBB. + while (!OrigBB->succ_empty()) { + MachineBasicBlock *Succ = *OrigBB->succ_begin(); + OrigBB->removeSuccessor(Succ); + NewBB->addSuccessor(Succ); + + // This pass should be run after register allocation, so there should be no + // PHI nodes to update. + assert((Succ->empty() || !Succ->begin()->isPHI()) + && "PHI nodes should be eliminated by now!"); + } + + // OrigBB branches to NewBB. + OrigBB->addSuccessor(NewBB); + + // Update internal data structures to account for the newly inserted MBB. + // This is almost the same as UpdateForInsertedWaterBlock, except that + // the Water goes after OrigBB, not NewBB. + MF.RenumberBlocks(NewBB); + + // Insert a size into BBSizes to align it properly with the (newly + // renumbered) block numbers. + BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0); + + // Likewise for BBOffsets. + BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0); + + // Next, update WaterList. Specifically, we need to add OrigMBB as having + // available water after it (but not if it's already there, which happens + // when splitting before a conditional branch that is followed by an + // unconditional branch - in that case we want to insert NewBB). + water_iterator IP = + std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB, + CompareMBBNumbers); + MachineBasicBlock* WaterBB = *IP; + if (WaterBB == OrigBB) + WaterList.insert(llvm::next(IP), NewBB); + else + WaterList.insert(IP, OrigBB); + NewWaterList.insert(OrigBB); + + // Figure out how large the first NewMBB is. (It cannot + // contain a constpool_entry or tablejump.) 
+ unsigned NewBBSize = 0; + for (MachineBasicBlock::iterator I = NewBB->begin(), E = NewBB->end(); + I != E; ++I) + NewBBSize += TII->GetInstSizeInBytes(I); + + unsigned OrigBBI = OrigBB->getNumber(); + unsigned NewBBI = NewBB->getNumber(); + // Set the size of NewBB in BBSizes. + BBSizes[NewBBI] = NewBBSize; + + // We removed instructions from UserMBB, subtract that off from its size. + // Add 2 or 4 to the block to count the unconditional branch we added to it. + int delta = isThumb1 ? 2 : 4; + BBSizes[OrigBBI] -= NewBBSize - delta; + + // ...and adjust BBOffsets for NewBB accordingly. + BBOffsets[NewBBI] = BBOffsets[OrigBBI] + BBSizes[OrigBBI]; + + // All BBOffsets following these blocks must be modified. + AdjustBBOffsetsAfter(NewBB, delta); + + return NewBB; +} + +/// OffsetIsInRange - Checks whether UserOffset (the location of a constant pool +/// reference) is within MaxDisp of TrialOffset (a proposed location of a +/// constant pool entry). +bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset, + unsigned TrialOffset, unsigned MaxDisp, + bool NegativeOK, bool IsSoImm) { + // On Thumb offsets==2 mod 4 are rounded down by the hardware for + // purposes of the displacement computation; compensate for that here. + // Effectively, the valid range of displacements is 2 bytes smaller for such + // references. + unsigned TotalAdj = 0; + if (isThumb && UserOffset%4 !=0) { + UserOffset -= 2; + TotalAdj = 2; + } + // CPEs will be rounded up to a multiple of 4. + if (isThumb && TrialOffset%4 != 0) { + TrialOffset += 2; + TotalAdj += 2; + } + + // In Thumb2 mode, later branch adjustments can shift instructions up and + // cause alignment change. In the worst case scenario this can cause the + // user's effective address to be subtracted by 2 and the CPE's address to + // be plus 2. + if (isThumb2 && TotalAdj != 4) + MaxDisp -= (4 - TotalAdj); + + if (UserOffset <= TrialOffset) { + // User before the Trial. + if (TrialOffset - UserOffset <= MaxDisp) + return true; + // FIXME: Make use full range of soimm values. + } else if (NegativeOK) { + if (UserOffset - TrialOffset <= MaxDisp) + return true; + // FIXME: Make use full range of soimm values. + } + return false; +} + +/// WaterIsInRange - Returns true if a CPE placed after the specified +/// Water (a basic block) will be in range for the specific MI. + +bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset, + MachineBasicBlock* Water, CPUser &U) { + unsigned MaxDisp = U.MaxDisp; + unsigned CPEOffset = BBOffsets[Water->getNumber()] + + BBSizes[Water->getNumber()]; + + // If the CPE is to be inserted before the instruction, that will raise + // the offset of the instruction. + if (CPEOffset < UserOffset) + UserOffset += U.CPEMI->getOperand(2).getImm(); + + return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, U.NegOk, U.IsSoImm); +} + +/// CPEIsInRange - Returns true if the distance between specific MI and +/// specific ConstPool entry instruction can fit in MI's displacement field. 
+bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset, + MachineInstr *CPEMI, unsigned MaxDisp, + bool NegOk, bool DoDump) { + unsigned CPEOffset = GetOffsetOf(CPEMI); + assert((CPEOffset%4 == 0 || HasInlineAsm) && "Misaligned CPE"); + + if (DoDump) { + DEBUG(errs() << "User of CPE#" << CPEMI->getOperand(0).getImm() + << " max delta=" << MaxDisp + << " insn address=" << UserOffset + << " CPE address=" << CPEOffset + << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI); + } + + return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, NegOk); +} + +#ifndef NDEBUG +/// BBIsJumpedOver - Return true of the specified basic block's only predecessor +/// unconditionally branches to its only successor. +static bool BBIsJumpedOver(MachineBasicBlock *MBB) { + if (MBB->pred_size() != 1 || MBB->succ_size() != 1) + return false; + + MachineBasicBlock *Succ = *MBB->succ_begin(); + MachineBasicBlock *Pred = *MBB->pred_begin(); + MachineInstr *PredMI = &Pred->back(); + if (PredMI->getOpcode() == ARM::B || PredMI->getOpcode() == ARM::tB + || PredMI->getOpcode() == ARM::t2B) + return PredMI->getOperand(0).getMBB() == Succ; + return false; +} +#endif // NDEBUG + +void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB, + int delta) { + MachineFunction::iterator MBBI = BB; MBBI = llvm::next(MBBI); + for(unsigned i = BB->getNumber()+1, e = BB->getParent()->getNumBlockIDs(); + i < e; ++i) { + BBOffsets[i] += delta; + // If some existing blocks have padding, adjust the padding as needed, a + // bit tricky. delta can be negative so don't use % on that. + if (!isThumb) + continue; + MachineBasicBlock *MBB = MBBI; + if (!MBB->empty() && !HasInlineAsm) { + // Constant pool entries require padding. + if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) { + unsigned OldOffset = BBOffsets[i] - delta; + if ((OldOffset%4) == 0 && (BBOffsets[i]%4) != 0) { + // add new padding + BBSizes[i] += 2; + delta += 2; + } else if ((OldOffset%4) != 0 && (BBOffsets[i]%4) == 0) { + // remove existing padding + BBSizes[i] -= 2; + delta -= 2; + } + } + // Thumb1 jump tables require padding. They should be at the end; + // following unconditional branches are removed by AnalyzeBranch. + MachineInstr *ThumbJTMI = prior(MBB->end()); + if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) { + unsigned NewMIOffset = GetOffsetOf(ThumbJTMI); + unsigned OldMIOffset = NewMIOffset - delta; + if ((OldMIOffset%4) == 0 && (NewMIOffset%4) != 0) { + // remove existing padding + BBSizes[i] -= 2; + delta -= 2; + } else if ((OldMIOffset%4) != 0 && (NewMIOffset%4) == 0) { + // add new padding + BBSizes[i] += 2; + delta += 2; + } + } + if (delta==0) + return; + } + MBBI = llvm::next(MBBI); + } +} + +/// DecrementOldEntry - find the constant pool entry with index CPI +/// and instruction CPEMI, and decrement its refcount. If the refcount +/// becomes 0 remove the entry and instruction. Returns true if we removed +/// the entry, false if we didn't. + +bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) { + // Find the old entry. Eliminate it if it is no longer used. + CPEntry *CPE = findConstPoolEntry(CPI, CPEMI); + assert(CPE && "Unexpected!"); + if (--CPE->RefCount == 0) { + RemoveDeadCPEMI(CPEMI); + CPE->CPEMI = NULL; + NumCPEs--; + return true; + } + return false; +} + +/// LookForCPEntryInRange - see if the currently referenced CPE is in range; +/// if not, see if an in-range clone of the CPE is in range, and if so, +/// change the data structures so the user references the clone. 
Returns: +/// 0 = no existing entry found +/// 1 = entry found, and there were no code insertions or deletions +/// 2 = entry found, and there were code insertions or deletions +int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset) +{ + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + + // Check to see if the CPE is already in-range. + if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, U.NegOk, true)) { + DEBUG(errs() << "In range\n"); + return 1; + } + + // No. Look for previously created clones of the CPE that are in range. + unsigned CPI = CPEMI->getOperand(1).getIndex(); + std::vector<CPEntry> &CPEs = CPEntries[CPI]; + for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { + // We already tried this one + if (CPEs[i].CPEMI == CPEMI) + continue; + // Removing CPEs can leave empty entries, skip + if (CPEs[i].CPEMI == NULL) + continue; + if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, U.NegOk)) { + DEBUG(errs() << "Replacing CPE#" << CPI << " with CPE#" + << CPEs[i].CPI << "\n"); + // Point the CPUser node to the replacement + U.CPEMI = CPEs[i].CPEMI; + // Change the CPI in the instruction operand to refer to the clone. + for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j) + if (UserMI->getOperand(j).isCPI()) { + UserMI->getOperand(j).setIndex(CPEs[i].CPI); + break; + } + // Adjust the refcount of the clone... + CPEs[i].RefCount++; + // ...and the original. If we didn't remove the old entry, none of the + // addresses changed, so we don't need another pass. + return DecrementOldEntry(CPI, CPEMI) ? 2 : 1; + } + } + return 0; +} + +/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in +/// the specific unconditional branch instruction. +static inline unsigned getUnconditionalBrDisp(int Opc) { + switch (Opc) { + case ARM::tB: + return ((1<<10)-1)*2; + case ARM::t2B: + return ((1<<23)-1)*2; + default: + break; + } + + return ((1<<23)-1)*4; +} + +/// LookForWater - Look for an existing entry in the WaterList in which +/// we can place the CPE referenced from U so it's within range of U's MI. +/// Returns true if found, false if not. If it returns true, WaterIter +/// is set to the WaterList entry. For Thumb, prefer water that will not +/// introduce padding to water that will. To ensure that this pass +/// terminates, the CPE location for a particular CPUser is only allowed to +/// move to a lower address, so search backward from the end of the list and +/// prefer the first water that is in range. +bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset, + water_iterator &WaterIter) { + if (WaterList.empty()) + return false; + + bool FoundWaterThatWouldPad = false; + water_iterator IPThatWouldPad; + for (water_iterator IP = prior(WaterList.end()), + B = WaterList.begin();; --IP) { + MachineBasicBlock* WaterBB = *IP; + // Check if water is in range and is either at a lower address than the + // current "high water mark" or a new water block that was created since + // the previous iteration by inserting an unconditional branch. In the + // latter case, we want to allow resetting the high water mark back to + // this new water since we haven't seen it before. Inserting branches + // should be relatively uncommon and when it does happen, we want to be + // sure to take advantage of it for all the CPEs near that block, so that + // we don't insert more branches than necessary. 
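+    // (For example, if this user's high water mark is block #7, only water in
+    // blocks #0..#6, or water created on this iteration, is acceptable, which
+    // is how the pass keeps each CPE from drifting back up to higher
+    // addresses.)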
+ if (WaterIsInRange(UserOffset, WaterBB, U) && + (WaterBB->getNumber() < U.HighWaterMark->getNumber() || + NewWaterList.count(WaterBB))) { + unsigned WBBId = WaterBB->getNumber(); + if (isThumb && + (BBOffsets[WBBId] + BBSizes[WBBId])%4 != 0) { + // This is valid Water, but would introduce padding. Remember + // it in case we don't find any Water that doesn't do this. + if (!FoundWaterThatWouldPad) { + FoundWaterThatWouldPad = true; + IPThatWouldPad = IP; + } + } else { + WaterIter = IP; + return true; + } + } + if (IP == B) + break; + } + if (FoundWaterThatWouldPad) { + WaterIter = IPThatWouldPad; + return true; + } + return false; +} + +/// CreateNewWater - No existing WaterList entry will work for +/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the +/// block is used if in range, and the conditional branch munged so control +/// flow is correct. Otherwise the block is split to create a hole with an +/// unconditional branch around it. In either case NewMBB is set to a +/// block following which the new island can be inserted (the WaterList +/// is not adjusted). +void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, + unsigned UserOffset, + MachineBasicBlock *&NewMBB) { + CPUser &U = CPUsers[CPUserIndex]; + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + MachineBasicBlock *UserMBB = UserMI->getParent(); + unsigned OffsetOfNextBlock = BBOffsets[UserMBB->getNumber()] + + BBSizes[UserMBB->getNumber()]; + assert(OffsetOfNextBlock== BBOffsets[UserMBB->getNumber()+1]); + + // If the block does not end in an unconditional branch already, and if the + // end of the block is within range, make new water there. (The addition + // below is for the unconditional branch we will be adding: 4 bytes on ARM + + // Thumb2, 2 on Thumb1. Possible Thumb1 alignment padding is allowed for + // inside OffsetIsInRange. + if (BBHasFallthrough(UserMBB) && + OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb1 ? 2: 4), + U.MaxDisp, U.NegOk, U.IsSoImm)) { + DEBUG(errs() << "Split at end of block\n"); + if (&UserMBB->back() == UserMI) + assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!"); + NewMBB = llvm::next(MachineFunction::iterator(UserMBB)); + // Add an unconditional branch from UserMBB to fallthrough block. + // Record it for branch lengthening; this new branch will not get out of + // range, but if the preceding conditional branch is out of range, the + // targets will be exchanged, and the altered branch may be out of + // range, so the machinery has to know about it. + int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B; + BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB); + unsigned MaxDisp = getUnconditionalBrDisp(UncondBr); + ImmBranches.push_back(ImmBranch(&UserMBB->back(), + MaxDisp, false, UncondBr)); + int delta = isThumb1 ? 2 : 4; + BBSizes[UserMBB->getNumber()] += delta; + AdjustBBOffsetsAfter(UserMBB, delta); + } else { + // What a big block. Find a place within the block to split it. + // This is a little tricky on Thumb1 since instructions are 2 bytes + // and constant pool entries are 4 bytes: if instruction I references + // island CPE, and instruction I+1 references CPE', it will + // not work well to put CPE as far forward as possible, since then + // CPE' cannot immediately follow it (that location is 2 bytes + // farther away from I+1 than CPE was from I) and we'd need to create + // a new island. 
So, we make a first guess, then walk through the + // instructions between the one currently being looked at and the + // possible insertion point, and make sure any other instructions + // that reference CPEs will be able to use the same island area; + // if not, we back up the insertion point. + + // The 4 in the following is for the unconditional branch we'll be + // inserting (allows for long branch on Thumb1). Alignment of the + // island is handled inside OffsetIsInRange. + unsigned BaseInsertOffset = UserOffset + U.MaxDisp -4; + // This could point off the end of the block if we've already got + // constant pool entries following this block; only the last one is + // in the water list. Back past any possible branches (allow for a + // conditional and a maximally long unconditional). + if (BaseInsertOffset >= BBOffsets[UserMBB->getNumber()+1]) + BaseInsertOffset = BBOffsets[UserMBB->getNumber()+1] - + (isThumb1 ? 6 : 8); + unsigned EndInsertOffset = BaseInsertOffset + + CPEMI->getOperand(2).getImm(); + MachineBasicBlock::iterator MI = UserMI; + ++MI; + unsigned CPUIndex = CPUserIndex+1; + for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI); + Offset < BaseInsertOffset; + Offset += TII->GetInstSizeInBytes(MI), + MI = llvm::next(MI)) { + if (CPUIndex < CPUsers.size() && CPUsers[CPUIndex].MI == MI) { + CPUser &U = CPUsers[CPUIndex]; + if (!OffsetIsInRange(Offset, EndInsertOffset, + U.MaxDisp, U.NegOk, U.IsSoImm)) { + BaseInsertOffset -= (isThumb1 ? 2 : 4); + EndInsertOffset -= (isThumb1 ? 2 : 4); + } + // This is overly conservative, as we don't account for CPEMIs + // being reused within the block, but it doesn't matter much. + EndInsertOffset += CPUsers[CPUIndex].CPEMI->getOperand(2).getImm(); + CPUIndex++; + } + } + DEBUG(errs() << "Split in middle of big block\n"); + NewMBB = SplitBlockBeforeInstr(prior(MI)); + } +} + +/// HandleConstantPoolUser - Analyze the specified user, checking to see if it +/// is out-of-range. If so, pick up the constant pool value and move it some +/// place in-range. Return true if we changed any addresses (thus must run +/// another pass of branch lengthening), false otherwise. +bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, + unsigned CPUserIndex) { + CPUser &U = CPUsers[CPUserIndex]; + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + unsigned CPI = CPEMI->getOperand(1).getIndex(); + unsigned Size = CPEMI->getOperand(2).getImm(); + // Compute this only once, it's expensive. The 4 or 8 is the value the + // hardware keeps in the PC. + unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8); + + // See if the current entry is within range, or there is a clone of it + // in range. + int result = LookForExistingCPEntry(U, UserOffset); + if (result==1) return false; + else if (result==2) return true; + + // No existing clone of this CPE is within range. + // We will be generating a new clone. Get a UID for it. + unsigned ID = AFI->createConstPoolEntryUId(); + + // Look for water where we can place this CPE. + MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock(); + MachineBasicBlock *NewMBB; + water_iterator IP; + if (LookForWater(U, UserOffset, IP)) { + DEBUG(errs() << "found water in range\n"); + MachineBasicBlock *WaterBB = *IP; + + // If the original WaterList entry was "new water" on this iteration, + // propagate that to the new island. This is just keeping NewWaterList + // updated to match the WaterList, which will be updated below. 
+ if (NewWaterList.count(WaterBB)) { + NewWaterList.erase(WaterBB); + NewWaterList.insert(NewIsland); + } + // The new CPE goes before the following block (NewMBB). + NewMBB = llvm::next(MachineFunction::iterator(WaterBB)); + + } else { + // No water found. + DEBUG(errs() << "No water found\n"); + CreateNewWater(CPUserIndex, UserOffset, NewMBB); + + // SplitBlockBeforeInstr adds to WaterList, which is important when it is + // called while handling branches so that the water will be seen on the + // next iteration for constant pools, but in this context, we don't want + // it. Check for this so it will be removed from the WaterList. + // Also remove any entry from NewWaterList. + MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB)); + IP = std::find(WaterList.begin(), WaterList.end(), WaterBB); + if (IP != WaterList.end()) + NewWaterList.erase(WaterBB); + + // We are adding new water. Update NewWaterList. + NewWaterList.insert(NewIsland); + } + + // Remove the original WaterList entry; we want subsequent insertions in + // this vicinity to go after the one we're about to insert. This + // considerably reduces the number of times we have to move the same CPE + // more than once and is also important to ensure the algorithm terminates. + if (IP != WaterList.end()) + WaterList.erase(IP); + + // Okay, we know we can put an island before NewMBB now, do it! + MF.insert(NewMBB, NewIsland); + + // Update internal data structures to account for the newly inserted MBB. + UpdateForInsertedWaterBlock(NewIsland); + + // Decrement the old entry, and remove it if refcount becomes 0. + DecrementOldEntry(CPI, CPEMI); + + // Now that we have an island to add the CPE to, clone the original CPE and + // add it to the island. + U.HighWaterMark = NewIsland; + U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) + .addImm(ID).addConstantPoolIndex(CPI).addImm(Size); + CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1)); + NumCPEs++; + + BBOffsets[NewIsland->getNumber()] = BBOffsets[NewMBB->getNumber()]; + // Compensate for .align 2 in thumb mode. + if (isThumb && (BBOffsets[NewIsland->getNumber()]%4 != 0 || HasInlineAsm)) + Size += 2; + // Increase the size of the island block to account for the new entry. + BBSizes[NewIsland->getNumber()] += Size; + AdjustBBOffsetsAfter(NewIsland, Size); + + // Finally, change the CPI in the instruction operand to be ID. + for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) + if (UserMI->getOperand(i).isCPI()) { + UserMI->getOperand(i).setIndex(ID); + break; + } + + DEBUG(errs() << " Moved CPE to #" << ID << " CPI=" << CPI + << '\t' << *UserMI); + + return true; +} + +/// RemoveDeadCPEMI - Remove a dead constant pool entry instruction. Update +/// sizes and offsets of impacted basic blocks. +void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) { + MachineBasicBlock *CPEBB = CPEMI->getParent(); + unsigned Size = CPEMI->getOperand(2).getImm(); + CPEMI->eraseFromParent(); + BBSizes[CPEBB->getNumber()] -= Size; + // All succeeding offsets have the current size value added in, fix this. + if (CPEBB->empty()) { + // In thumb1 mode, the size of island may be padded by two to compensate for + // the alignment requirement. Then it will now be 2 when the block is + // empty, so fix this. + // All succeeding offsets have the current size value added in, fix this. 
+ if (BBSizes[CPEBB->getNumber()] != 0) { + Size += BBSizes[CPEBB->getNumber()]; + BBSizes[CPEBB->getNumber()] = 0; + } + } + AdjustBBOffsetsAfter(CPEBB, -Size); + // An island has only one predecessor BB and one successor BB. Check if + // this BB's predecessor jumps directly to this BB's successor. This + // shouldn't happen currently. + assert(!BBIsJumpedOver(CPEBB) && "How did this happen?"); + // FIXME: remove the empty blocks after all the work is done? +} + +/// RemoveUnusedCPEntries - Remove constant pool entries whose refcounts +/// are zero. +bool ARMConstantIslands::RemoveUnusedCPEntries() { + unsigned MadeChange = false; + for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { + std::vector<CPEntry> &CPEs = CPEntries[i]; + for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) { + if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) { + RemoveDeadCPEMI(CPEs[j].CPEMI); + CPEs[j].CPEMI = NULL; + MadeChange = true; + } + } + } + return MadeChange; +} + +/// BBIsInRange - Returns true if the distance between specific MI and +/// specific BB can fit in MI's displacement field. +bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB, + unsigned MaxDisp) { + unsigned PCAdj = isThumb ? 4 : 8; + unsigned BrOffset = GetOffsetOf(MI) + PCAdj; + unsigned DestOffset = BBOffsets[DestBB->getNumber()]; + + DEBUG(errs() << "Branch of destination BB#" << DestBB->getNumber() + << " from BB#" << MI->getParent()->getNumber() + << " max delta=" << MaxDisp + << " from " << GetOffsetOf(MI) << " to " << DestOffset + << " offset " << int(DestOffset-BrOffset) << "\t" << *MI); + + if (BrOffset <= DestOffset) { + // Branch before the Dest. + if (DestOffset-BrOffset <= MaxDisp) + return true; + } else { + if (BrOffset-DestOffset <= MaxDisp) + return true; + } + return false; +} + +/// FixUpImmediateBr - Fix up an immediate branch whose destination is too far +/// away to fit in its displacement field. +bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *DestBB = MI->getOperand(0).getMBB(); + + // Check to see if the DestBB is already in-range. + if (BBIsInRange(MI, DestBB, Br.MaxDisp)) + return false; + + if (!Br.isCond) + return FixUpUnconditionalBr(MF, Br); + return FixUpConditionalBr(MF, Br); +} + +/// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is +/// too far away to fit in its displacement field. If the LR register has been +/// spilled in the epilogue, then we can use BL to implement a far jump. +/// Otherwise, add an intermediate branch instruction to a branch. +bool +ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *MBB = MI->getParent(); + if (!isThumb1) + llvm_unreachable("FixUpUnconditionalBr is Thumb1 only!"); + + // Use BL to implement far jump. + Br.MaxDisp = (1 << 21) * 2; + MI->setDesc(TII->get(ARM::tBfar)); + BBSizes[MBB->getNumber()] += 2; + AdjustBBOffsetsAfter(MBB, 2); + HasFarJump = true; + NumUBrFixed++; + + DEBUG(errs() << " Changed B to long jump " << *MI); + + return true; +} + +/// FixUpConditionalBr - Fix up a conditional branch whose destination is too +/// far away to fit in its displacement field. It is converted to an inverse +/// conditional branch + an unconditional branch to the destination. 
+bool +ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *DestBB = MI->getOperand(0).getMBB(); + + // Add an unconditional branch to the destination and invert the branch + // condition to jump over it: + // blt L1 + // => + // bge L2 + // b L1 + // L2: + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm(); + CC = ARMCC::getOppositeCondition(CC); + unsigned CCReg = MI->getOperand(2).getReg(); + + // If the branch is at the end of its MBB and that has a fall-through block, + // direct the updated conditional branch to the fall-through block. Otherwise, + // split the MBB before the next instruction. + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *BMI = &MBB->back(); + bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); + + NumCBrFixed++; + if (BMI != MI) { + if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) && + BMI->getOpcode() == Br.UncondBr) { + // Last MI in the BB is an unconditional branch. Can we simply invert the + // condition and swap destinations: + // beq L1 + // b L2 + // => + // bne L2 + // b L1 + MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); + if (BBIsInRange(MI, NewDest, Br.MaxDisp)) { + DEBUG(errs() << " Invert Bcc condition and swap its destination with " + << *BMI); + BMI->getOperand(0).setMBB(DestBB); + MI->getOperand(0).setMBB(NewDest); + MI->getOperand(1).setImm(CC); + return true; + } + } + } + + if (NeedSplit) { + SplitBlockBeforeInstr(MI); + // No need for the branch to the next block. We're adding an unconditional + // branch to the destination. + int delta = TII->GetInstSizeInBytes(&MBB->back()); + BBSizes[MBB->getNumber()] -= delta; + MachineBasicBlock* SplitBB = llvm::next(MachineFunction::iterator(MBB)); + AdjustBBOffsetsAfter(SplitBB, -delta); + MBB->back().eraseFromParent(); + // BBOffsets[SplitBB] is wrong temporarily, fixed below + } + MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB)); + + DEBUG(errs() << " Insert B to BB#" << DestBB->getNumber() + << " also invert condition and change dest. to BB#" + << NextBB->getNumber() << "\n"); + + // Insert a new conditional branch and a new unconditional branch. + // Also update the ImmBranch as well as adding a new entry for the new branch. + BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode())) + .addMBB(NextBB).addImm(CC).addReg(CCReg); + Br.MI = &MBB->back(); + BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back()); + BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB); + BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back()); + unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr); + ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr)); + + // Remove the old conditional branch. It may or may not still be in MBB. + BBSizes[MI->getParent()->getNumber()] -= TII->GetInstSizeInBytes(MI); + MI->eraseFromParent(); + + // The net size change is an addition of one unconditional branch. + int delta = TII->GetInstSizeInBytes(&MBB->back()); + AdjustBBOffsetsAfter(MBB, delta); + return true; +} + +/// UndoLRSpillRestore - Remove Thumb push / pop instructions that only spills +/// LR / restores LR to pc. FIXME: This is done here because it's only possible +/// to do this if tBfar is not used. +bool ARMConstantIslands::UndoLRSpillRestore() { + bool MadeChange = false; + for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) { + MachineInstr *MI = PushPopMIs[i]; + // First two operands are predicates. 
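+    // So the candidate shape is roughly
+    //   tPOP_RET <pred>, <pred-reg>, pc
+    // i.e. a predicate in operands 0 and 1 and a register list whose only
+    // entry is PC, which is what the operand-count and operand-2 checks
+    // below look for.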
+ if (MI->getOpcode() == ARM::tPOP_RET && + MI->getOperand(2).getReg() == ARM::PC && + MI->getNumExplicitOperands() == 3) { + BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET)); + MI->eraseFromParent(); + MadeChange = true; + } + } + return MadeChange; +} + +bool ARMConstantIslands::OptimizeThumb2Instructions(MachineFunction &MF) { + bool MadeChange = false; + + // Shrink ADR and LDR from constantpool. + for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { + CPUser &U = CPUsers[i]; + unsigned Opcode = U.MI->getOpcode(); + unsigned NewOpc = 0; + unsigned Scale = 1; + unsigned Bits = 0; + switch (Opcode) { + default: break; + case ARM::t2LEApcrel: + if (isARMLowRegister(U.MI->getOperand(0).getReg())) { + NewOpc = ARM::tLEApcrel; + Bits = 8; + Scale = 4; + } + break; + case ARM::t2LDRpci: + if (isARMLowRegister(U.MI->getOperand(0).getReg())) { + NewOpc = ARM::tLDRpci; + Bits = 8; + Scale = 4; + } + break; + } + + if (!NewOpc) + continue; + + unsigned UserOffset = GetOffsetOf(U.MI) + 4; + unsigned MaxOffs = ((1 << Bits) - 1) * Scale; + // FIXME: Check if offset is multiple of scale if scale is not 4. + if (CPEIsInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) { + U.MI->setDesc(TII->get(NewOpc)); + MachineBasicBlock *MBB = U.MI->getParent(); + BBSizes[MBB->getNumber()] -= 2; + AdjustBBOffsetsAfter(MBB, -2); + ++NumT2CPShrunk; + MadeChange = true; + } + } + + MadeChange |= OptimizeThumb2Branches(MF); + MadeChange |= OptimizeThumb2JumpTables(MF); + return MadeChange; +} + +bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { + bool MadeChange = false; + + for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) { + ImmBranch &Br = ImmBranches[i]; + unsigned Opcode = Br.MI->getOpcode(); + unsigned NewOpc = 0; + unsigned Scale = 1; + unsigned Bits = 0; + switch (Opcode) { + default: break; + case ARM::t2B: + NewOpc = ARM::tB; + Bits = 11; + Scale = 2; + break; + case ARM::t2Bcc: { + NewOpc = ARM::tBcc; + Bits = 8; + Scale = 2; + break; + } + } + if (NewOpc) { + unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + if (BBIsInRange(Br.MI, DestBB, MaxOffs)) { + Br.MI->setDesc(TII->get(NewOpc)); + MachineBasicBlock *MBB = Br.MI->getParent(); + BBSizes[MBB->getNumber()] -= 2; + AdjustBBOffsetsAfter(MBB, -2); + ++NumT2BrShrunk; + MadeChange = true; + } + } + + Opcode = Br.MI->getOpcode(); + if (Opcode != ARM::tBcc) + continue; + + NewOpc = 0; + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(Br.MI, PredReg); + if (Pred == ARMCC::EQ) + NewOpc = ARM::tCBZ; + else if (Pred == ARMCC::NE) + NewOpc = ARM::tCBNZ; + if (!NewOpc) + continue; + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + // Check if the distance is within 126. Subtract starting offset by 2 + // because the cmp will be eliminated. 
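+    // CBZ / CBNZ encode a 6-bit immediate scaled by 2 and can only branch
+    // forward, giving a reach of 0..126 bytes; that is where the 126 below
+    // comes from.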
+ unsigned BrOffset = GetOffsetOf(Br.MI) + 4 - 2; + unsigned DestOffset = BBOffsets[DestBB->getNumber()]; + if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) { + MachineBasicBlock::iterator CmpMI = Br.MI; --CmpMI; + if (CmpMI->getOpcode() == ARM::tCMPzi8) { + unsigned Reg = CmpMI->getOperand(0).getReg(); + Pred = llvm::getInstrPredicate(CmpMI, PredReg); + if (Pred == ARMCC::AL && + CmpMI->getOperand(1).getImm() == 0 && + isARMLowRegister(Reg)) { + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineInstr *NewBR = + BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc)) + .addReg(Reg).addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags()); + CmpMI->eraseFromParent(); + Br.MI->eraseFromParent(); + Br.MI = NewBR; + BBSizes[MBB->getNumber()] -= 2; + AdjustBBOffsetsAfter(MBB, -2); + ++NumCBZ; + MadeChange = true; + } + } + } + } + + return MadeChange; +} + +/// OptimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller +/// jumptables when it's possible. +bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { + bool MadeChange = false; + + // FIXME: After the tables are shrunk, can we get rid some of the + // constantpool tables? + MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + if (MJTI == 0) return false; + + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { + MachineInstr *MI = T2JumpTables[i]; + const TargetInstrDesc &TID = MI->getDesc(); + unsigned NumOps = TID.getNumOperands(); + unsigned JTOpIdx = NumOps - (TID.isPredicable() ? 3 : 2); + MachineOperand JTOP = MI->getOperand(JTOpIdx); + unsigned JTI = JTOP.getIndex(); + assert(JTI < JT.size()); + + bool ByteOk = true; + bool HalfWordOk = true; + unsigned JTOffset = GetOffsetOf(MI) + 4; + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { + MachineBasicBlock *MBB = JTBBs[j]; + unsigned DstOffset = BBOffsets[MBB->getNumber()]; + // Negative offset is not ok. FIXME: We should change BB layout to make + // sure all the branches are forward. + if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2) + ByteOk = false; + unsigned TBHLimit = ((1<<16)-1)*2; + if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit) + HalfWordOk = false; + if (!ByteOk && !HalfWordOk) + break; + } + + if (ByteOk || HalfWordOk) { + MachineBasicBlock *MBB = MI->getParent(); + unsigned BaseReg = MI->getOperand(0).getReg(); + bool BaseRegKill = MI->getOperand(0).isKill(); + if (!BaseRegKill) + continue; + unsigned IdxReg = MI->getOperand(1).getReg(); + bool IdxRegKill = MI->getOperand(1).isKill(); + MachineBasicBlock::iterator PrevI = MI; + if (PrevI == MBB->begin()) + continue; + + MachineInstr *AddrMI = --PrevI; + bool OptOk = true; + // Examine the instruction that calculate the jumptable entry address. + // If it's not the one just before the t2BR_JT, we won't delete it, then + // it's not worth doing the optimization. + for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) { + const MachineOperand &MO = AddrMI->getOperand(k); + if (!MO.isReg() || !MO.getReg()) + continue; + if (MO.isDef() && MO.getReg() != BaseReg) { + OptOk = false; + break; + } + if (MO.isUse() && !MO.isKill() && MO.getReg() != IdxReg) { + OptOk = false; + break; + } + } + if (!OptOk) + continue; + + // The previous instruction should be a tLEApcrel or t2LEApcrelJT, we want + // to delete it as well. 
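+      // At this point the sequence being matched is, roughly:
+      //   BaseReg = t2LEApcrelJT <jt>          (LeaMI, checked below)
+      //   BaseReg = add BaseReg, IdxReg, ...   (AddrMI, checked above)
+      //   t2BR_JT BaseReg<kill>, IdxReg, <jt>  (MI)
+      // and, when every target is close enough, it collapses to a single
+      // t2TBB / t2TBH indexed by IdxReg.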
+ MachineInstr *LeaMI = --PrevI; + if ((LeaMI->getOpcode() != ARM::tLEApcrelJT && + LeaMI->getOpcode() != ARM::t2LEApcrelJT) || + LeaMI->getOperand(0).getReg() != BaseReg) + OptOk = false; + + if (!OptOk) + continue; + + unsigned Opc = ByteOk ? ARM::t2TBB : ARM::t2TBH; + MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc)) + .addReg(IdxReg, getKillRegState(IdxRegKill)) + .addJumpTableIndex(JTI, JTOP.getTargetFlags()) + .addImm(MI->getOperand(JTOpIdx+1).getImm()); + // FIXME: Insert an "ALIGN" instruction to ensure the next instruction + // is 2-byte aligned. For now, asm printer will fix it up. + unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI); + unsigned OrigSize = TII->GetInstSizeInBytes(AddrMI); + OrigSize += TII->GetInstSizeInBytes(LeaMI); + OrigSize += TII->GetInstSizeInBytes(MI); + + AddrMI->eraseFromParent(); + LeaMI->eraseFromParent(); + MI->eraseFromParent(); + + int delta = OrigSize - NewSize; + BBSizes[MBB->getNumber()] -= delta; + AdjustBBOffsetsAfter(MBB, -delta); + + ++NumTBs; + MadeChange = true; + } + } + + return MadeChange; +} + +/// ReorderThumb2JumpTables - Adjust the function's block layout to ensure that +/// jump tables always branch forwards, since that's what tbb and tbh need. +bool ARMConstantIslands::ReorderThumb2JumpTables(MachineFunction &MF) { + bool MadeChange = false; + + MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + if (MJTI == 0) return false; + + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { + MachineInstr *MI = T2JumpTables[i]; + const TargetInstrDesc &TID = MI->getDesc(); + unsigned NumOps = TID.getNumOperands(); + unsigned JTOpIdx = NumOps - (TID.isPredicable() ? 3 : 2); + MachineOperand JTOP = MI->getOperand(JTOpIdx); + unsigned JTI = JTOP.getIndex(); + assert(JTI < JT.size()); + + // We prefer if target blocks for the jump table come after the jump + // instruction so we can use TB[BH]. Loop through the target blocks + // and try to adjust them such that that's true. + int JTNumber = MI->getParent()->getNumber(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { + MachineBasicBlock *MBB = JTBBs[j]; + int DTNumber = MBB->getNumber(); + + if (DTNumber < JTNumber) { + // The destination precedes the switch. Try to move the block forward + // so we have a positive offset. + MachineBasicBlock *NewBB = + AdjustJTTargetBlockForward(MBB, MI->getParent()); + if (NewBB) + MJTI->ReplaceMBBInJumpTable(JTI, JTBBs[j], NewBB); + MadeChange = true; + } + } + } + + return MadeChange; +} + +MachineBasicBlock *ARMConstantIslands:: +AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) +{ + MachineFunction &MF = *BB->getParent(); + + // If it's the destination block is terminated by an unconditional branch, + // try to move it; otherwise, create a new block following the jump + // table that branches back to the actual target. This is a very simple + // heuristic. FIXME: We can definitely improve it. + MachineBasicBlock *TBB = 0, *FBB = 0; + SmallVector<MachineOperand, 4> Cond; + SmallVector<MachineOperand, 4> CondPrior; + MachineFunction::iterator BBi = BB; + MachineFunction::iterator OldPrior = prior(BBi); + + // If the block terminator isn't analyzable, don't try to move the block + bool B = TII->AnalyzeBranch(*BB, TBB, FBB, Cond); + + // If the block ends in an unconditional branch, move it. 
The prior block + // has to have an analyzable terminator for us to move this one. Be paranoid + // and make sure we're not trying to move the entry block of the function. + if (!B && Cond.empty() && BB != MF.begin() && + !TII->AnalyzeBranch(*OldPrior, TBB, FBB, CondPrior)) { + BB->moveAfter(JTBB); + OldPrior->updateTerminator(); + BB->updateTerminator(); + // Update numbering to account for the block being moved. + MF.RenumberBlocks(); + ++NumJTMoved; + return NULL; + } + + // Create a new MBB for the code after the jump BB. + MachineBasicBlock *NewBB = + MF.CreateMachineBasicBlock(JTBB->getBasicBlock()); + MachineFunction::iterator MBBI = JTBB; ++MBBI; + MF.insert(MBBI, NewBB); + + // Add an unconditional branch from NewBB to BB. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond directly to anything in the source. + assert (isThumb2 && "Adjusting for TB[BH] but not in Thumb2?"); + BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B)).addMBB(BB); + + // Update internal data structures to account for the newly inserted MBB. + MF.RenumberBlocks(NewBB); + + // Update the CFG. + NewBB->addSuccessor(BB); + JTBB->removeSuccessor(BB); + JTBB->addSuccessor(NewBB); + + ++NumJTInserted; + return NewBB; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp new file mode 100644 index 0000000..f13ccc6 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -0,0 +1,122 @@ +//===- ARMConstantPoolValue.cpp - ARM constantpool value --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific constantpool value class. 
+// +//===----------------------------------------------------------------------===// + +#include "ARMConstantPoolValue.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/Constant.h" +#include "llvm/Constants.h" +#include "llvm/GlobalValue.h" +#include "llvm/Type.h" +#include "llvm/Support/raw_ostream.h" +#include <cstdlib> +using namespace llvm; + +ARMConstantPoolValue::ARMConstantPoolValue(const Constant *cval, unsigned id, + ARMCP::ARMCPKind K, + unsigned char PCAdj, + const char *Modif, + bool AddCA) + : MachineConstantPoolValue((const Type*)cval->getType()), + CVal(cval), S(NULL), LabelId(id), Kind(K), PCAdjust(PCAdj), + Modifier(Modif), AddCurrentAddress(AddCA) {} + +ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, + const char *s, unsigned id, + unsigned char PCAdj, + const char *Modif, + bool AddCA) + : MachineConstantPoolValue((const Type*)Type::getInt32Ty(C)), + CVal(NULL), S(strdup(s)), LabelId(id), Kind(ARMCP::CPExtSymbol), + PCAdjust(PCAdj), Modifier(Modif), AddCurrentAddress(AddCA) {} + +ARMConstantPoolValue::ARMConstantPoolValue(const GlobalValue *gv, + const char *Modif) + : MachineConstantPoolValue((const Type*)Type::getInt32Ty(gv->getContext())), + CVal(gv), S(NULL), LabelId(0), Kind(ARMCP::CPValue), PCAdjust(0), + Modifier(Modif) {} + +const GlobalValue *ARMConstantPoolValue::getGV() const { + return dyn_cast_or_null<GlobalValue>(CVal); +} + +const BlockAddress *ARMConstantPoolValue::getBlockAddress() const { + return dyn_cast_or_null<BlockAddress>(CVal); +} + +int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) { + unsigned AlignMask = Alignment - 1; + const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants(); + for (unsigned i = 0, e = Constants.size(); i != e; ++i) { + if (Constants[i].isMachineConstantPoolEntry() && + (Constants[i].getAlignment() & AlignMask) == 0) { + ARMConstantPoolValue *CPV = + (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal; + if (CPV->CVal == CVal && + CPV->LabelId == LabelId && + CPV->PCAdjust == PCAdjust && + (CPV->S == S || strcmp(CPV->S, S) == 0) && + (CPV->Modifier == Modifier || strcmp(CPV->Modifier, Modifier) == 0)) + return i; + } + } + + return -1; +} + +ARMConstantPoolValue::~ARMConstantPoolValue() { + free((void*)S); +} + +void +ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddPointer(CVal); + ID.AddPointer(S); + ID.AddInteger(LabelId); + ID.AddInteger(PCAdjust); +} + +bool +ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) { + if (ACPV->Kind == Kind && + ACPV->CVal == CVal && + ACPV->PCAdjust == PCAdjust && + (ACPV->S == S || strcmp(ACPV->S, S) == 0) && + (ACPV->Modifier == Modifier || strcmp(ACPV->Modifier, Modifier) == 0)) { + if (ACPV->LabelId == LabelId) + return true; + // Two PC relative constpool entries containing the same GV address or + // external symbols. FIXME: What about blockaddress? 
+ if (Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol) + return true; + } + return false; +} + +void ARMConstantPoolValue::dump() const { + errs() << " " << *this; +} + + +void ARMConstantPoolValue::print(raw_ostream &O) const { + if (CVal) + O << CVal->getName(); + else + O << S; + if (Modifier) O << "(" << Modifier << ")"; + if (PCAdjust != 0) { + O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust; + if (AddCurrentAddress) O << "-."; + O << ")"; + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h new file mode 100644 index 0000000..6f4eddf --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -0,0 +1,100 @@ +//===- ARMConstantPoolValue.h - ARM constantpool value ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific constantpool value class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H +#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H + +#include "llvm/CodeGen/MachineConstantPool.h" + +namespace llvm { + +class Constant; +class BlockAddress; +class GlobalValue; +class LLVMContext; + +namespace ARMCP { + enum ARMCPKind { + CPValue, + CPExtSymbol, + CPBlockAddress, + CPLSDA + }; +} + +/// ARMConstantPoolValue - ARM specific constantpool value. This is used to +/// represent PC-relative displacement between the address of the load +/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)). +class ARMConstantPoolValue : public MachineConstantPoolValue { + const Constant *CVal; // Constant being loaded. + const char *S; // ExtSymbol being loaded. + unsigned LabelId; // Label id of the load. + ARMCP::ARMCPKind Kind; // Kind of constant. + unsigned char PCAdjust; // Extra adjustment if constantpool is pc-relative. + // 8 for ARM, 4 for Thumb. + const char *Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8)) + bool AddCurrentAddress; + +public: + ARMConstantPoolValue(const Constant *cval, unsigned id, + ARMCP::ARMCPKind Kind = ARMCP::CPValue, + unsigned char PCAdj = 0, const char *Modifier = NULL, + bool AddCurrentAddress = false); + ARMConstantPoolValue(LLVMContext &C, const char *s, unsigned id, + unsigned char PCAdj = 0, const char *Modifier = NULL, + bool AddCurrentAddress = false); + ARMConstantPoolValue(const GlobalValue *GV, const char *Modifier); + ARMConstantPoolValue(); + ~ARMConstantPoolValue(); + + const GlobalValue *getGV() const; + const char *getSymbol() const { return S; } + const BlockAddress *getBlockAddress() const; + const char *getModifier() const { return Modifier; } + bool hasModifier() const { return Modifier != NULL; } + bool mustAddCurrentAddress() const { return AddCurrentAddress; } + unsigned getLabelId() const { return LabelId; } + unsigned char getPCAdjustment() const { return PCAdjust; } + bool isGlobalValue() const { return Kind == ARMCP::CPValue; } + bool isExtSymbol() const { return Kind == ARMCP::CPExtSymbol; } + bool isBlockAddress() { return Kind == ARMCP::CPBlockAddress; } + bool isLSDA() { return Kind == ARMCP::CPLSDA; } + + virtual unsigned getRelocationInfo() const { + // FIXME: This is conservatively claiming that these entries require a + // relocation, we may be able to do better than this. 
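+    // (In this scheme 0 means no relocation, 1 relocations against local
+    // symbols only, and 2 possibly-global relocations, so 2 is the
+    // conservative answer.)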
+ return 2; + } + + virtual int getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment); + + virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID); + + /// hasSameValue - Return true if this ARM constpool value + /// can share the same constantpool entry as another ARM constpool value. + bool hasSameValue(ARMConstantPoolValue *ACPV); + + void print(raw_ostream *O) const { if (O) print(*O); } + void print(raw_ostream &O) const; + void dump() const; +}; + +inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) { + V.print(O); + return O; +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp new file mode 100644 index 0000000..c87f5d7 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -0,0 +1,180 @@ +//===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expand pseudo instructions into target +// instructions to allow proper scheduling, if-conversion, and other late +// optimizations. This pass should be run after register allocation but before +// post- regalloc scheduling pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-pseudo" +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +using namespace llvm; + +namespace { + class ARMExpandPseudo : public MachineFunctionPass { + public: + static char ID; + ARMExpandPseudo() : MachineFunctionPass(&ID) {} + + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM pseudo instruction expansion pass"; + } + + private: + void TransferImpOps(MachineInstr &OldMI, + MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI); + bool ExpandMBB(MachineBasicBlock &MBB); + }; + char ARMExpandPseudo::ID = 0; +} + +/// TransferImpOps - Transfer implicit operands on the pseudo instruction to +/// the instructions created from the expansion. +void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI, + MachineInstrBuilder &UseMI, + MachineInstrBuilder &DefMI) { + const TargetInstrDesc &Desc = OldMI.getDesc(); + for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); + i != e; ++i) { + const MachineOperand &MO = OldMI.getOperand(i); + assert(MO.isReg() && MO.getReg()); + if (MO.isUse()) + UseMI.addReg(MO.getReg(), getKillRegState(MO.isKill())); + else + DefMI.addReg(MO.getReg(), + getDefRegState(true) | getDeadRegState(MO.isDead())); + } +} + +bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator NMBBI = llvm::next(MBBI); + + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: break; + case ARM::tLDRpci_pic: + case ARM::t2LDRpci_pic: { + unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic) + ? 
ARM::tLDRpci : ARM::t2LDRpci; + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder MIB1 = + AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(NewLdOpc), DstReg) + .addOperand(MI.getOperand(1))); + (*MIB1).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::tPICADD)) + .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addOperand(MI.getOperand(2)); + TransferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + Modified = true; + break; + } + + case ARM::t2MOVi32imm: { + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + const MachineOperand &MO = MI.getOperand(1); + MachineInstrBuilder LO16, HI16; + + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::t2MOVi16), + DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::t2MOVTi16)) + .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(DstReg); + + if (MO.isImm()) { + unsigned Imm = MO.getImm(); + unsigned Lo16 = Imm & 0xffff; + unsigned Hi16 = (Imm >> 16) & 0xffff; + LO16 = LO16.addImm(Lo16); + HI16 = HI16.addImm(Hi16); + } else { + const GlobalValue *GV = MO.getGlobal(); + unsigned TF = MO.getTargetFlags(); + LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16); + HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); + } + (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16.addImm(Pred).addReg(PredReg); + HI16.addImm(Pred).addReg(PredReg); + TransferImpOps(MI, LO16, HI16); + MI.eraseFromParent(); + Modified = true; + break; + } + + case ARM::VMOVQQ: { + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + unsigned EvenDst = TRI->getSubReg(DstReg, ARM::qsub_0); + unsigned OddDst = TRI->getSubReg(DstReg, ARM::qsub_1); + unsigned SrcReg = MI.getOperand(1).getReg(); + bool SrcIsKill = MI.getOperand(1).isKill(); + unsigned EvenSrc = TRI->getSubReg(SrcReg, ARM::qsub_0); + unsigned OddSrc = TRI->getSubReg(SrcReg, ARM::qsub_1); + MachineInstrBuilder Even = + AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::VMOVQ)) + .addReg(EvenDst, getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(EvenSrc, getKillRegState(SrcIsKill))); + MachineInstrBuilder Odd = + AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::VMOVQ)) + .addReg(OddDst, getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(OddSrc, getKillRegState(SrcIsKill))); + TransferImpOps(MI, Even, Odd); + MI.eraseFromParent(); + Modified = true; + } + } + MBBI = NMBBI; + } + + return Modified; +} + +bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + + bool Modified = false; + for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; + ++MFI) + Modified |= ExpandMBB(*MFI); + return Modified; +} + +/// createARMExpandPseudoPass - returns an instance of the pseudo instruction +/// expansion pass. 
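+///
+/// A sketch of how a target machine would typically schedule it (the exact
+/// hook is an assumption here, not something this file defines):
+///
+///   bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM,
+///                                           CodeGenOpt::Level OptLevel) {
+///     // Expand pseudos after register allocation, ahead of post-RA sched.
+///     PM.add(createARMExpandPseudoPass());
+///     return true;
+///   }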
+FunctionPass *llvm::createARMExpandPseudoPass() { + return new ARMExpandPseudo(); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameInfo.h b/contrib/llvm/lib/Target/ARM/ARMFrameInfo.h new file mode 100644 index 0000000..d5dae24 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMFrameInfo.h @@ -0,0 +1,32 @@ +//===-- ARMTargetFrameInfo.h - Define TargetFrameInfo for ARM ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef ARM_FRAMEINFO_H +#define ARM_FRAMEINFO_H + +#include "ARM.h" +#include "ARMSubtarget.h" +#include "llvm/Target/TargetFrameInfo.h" + +namespace llvm { + +class ARMFrameInfo : public TargetFrameInfo { +public: + explicit ARMFrameInfo(const ARMSubtarget &ST) + : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0, 4) { + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp new file mode 100644 index 0000000..9baef6b --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -0,0 +1,2375 @@ +//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the ARM target. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMTargetMachine.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static cl::opt<bool> +UseRegSeq("neon-reg-sequence", cl::Hidden, + cl::desc("Use reg_sequence to model ld / st of multiple neon regs"), + cl::init(true)); + +//===--------------------------------------------------------------------===// +/// ARMDAGToDAGISel - ARM specific code to select ARM machine +/// instructions for SelectionDAG operations. +/// +namespace { +class ARMDAGToDAGISel : public SelectionDAGISel { + ARMBaseTargetMachine &TM; + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. 
+ const ARMSubtarget *Subtarget; + +public: + explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, + CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), TM(tm), + Subtarget(&TM.getSubtarget<ARMSubtarget>()) { + } + + virtual const char *getPassName() const { + return "ARM Instruction Selection"; + } + + /// getI32Imm - Return a target constant of type i32 with the specified + /// value. + inline SDValue getI32Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } + + SDNode *Select(SDNode *N); + + bool SelectShifterOperandReg(SDNode *Op, SDValue N, SDValue &A, + SDValue &B, SDValue &C); + bool SelectAddrMode2(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode2Offset(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode3(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode3Offset(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode4(SDNode *Op, SDValue N, SDValue &Addr, + SDValue &Mode); + bool SelectAddrMode5(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Offset); + bool SelectAddrMode6(SDNode *Op, SDValue N, SDValue &Addr, SDValue &Align); + + bool SelectAddrModePC(SDNode *Op, SDValue N, SDValue &Offset, + SDValue &Label); + + bool SelectThumbAddrModeRR(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Offset); + bool SelectThumbAddrModeRI5(SDNode *Op, SDValue N, unsigned Scale, + SDValue &Base, SDValue &OffImm, + SDValue &Offset); + bool SelectThumbAddrModeS1(SDNode *Op, SDValue N, SDValue &Base, + SDValue &OffImm, SDValue &Offset); + bool SelectThumbAddrModeS2(SDNode *Op, SDValue N, SDValue &Base, + SDValue &OffImm, SDValue &Offset); + bool SelectThumbAddrModeS4(SDNode *Op, SDValue N, SDValue &Base, + SDValue &OffImm, SDValue &Offset); + bool SelectThumbAddrModeSP(SDNode *Op, SDValue N, SDValue &Base, + SDValue &OffImm); + + bool SelectT2ShifterOperandReg(SDNode *Op, SDValue N, + SDValue &BaseReg, SDValue &Opc); + bool SelectT2AddrModeImm12(SDNode *Op, SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectT2AddrModeImm8(SDNode *Op, SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, + SDValue &OffImm); + bool SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectT2AddrModeSoReg(SDNode *Op, SDValue N, SDValue &Base, + SDValue &OffReg, SDValue &ShImm); + + // Include the pieces autogenerated from the target description. +#include "ARMGenDAGISel.inc" + +private: + /// SelectARMIndexedLoad - Indexed (pre/post inc/dec) load matching code for + /// ARM. + SDNode *SelectARMIndexedLoad(SDNode *N); + SDNode *SelectT2IndexedLoad(SDNode *N); + + /// SelectVLD - Select NEON load intrinsics. NumVecs should be + /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for + /// loads of D registers and even subregs and odd subregs of Q registers. + /// For NumVecs <= 2, QOpcodes1 is not used. + SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, + unsigned *QOpcodes0, unsigned *QOpcodes1); + + /// SelectVST - Select NEON store intrinsics. NumVecs should + /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for + /// stores of D registers and even subregs and odd subregs of Q registers. + /// For NumVecs <= 2, QOpcodes1 is not used. 
+ SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, + unsigned *QOpcodes0, unsigned *QOpcodes1); + + /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should + /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// load/store of D registers and even subregs and odd subregs of Q registers. + SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, + unsigned *QOpcodes1); + + /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM. + SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned); + + /// SelectCMOVOp - Select CMOV instructions for ARM. + SDNode *SelectCMOVOp(SDNode *N); + SDNode *SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, + SDValue InFlag); + SDNode *SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, + SDValue InFlag); + SDNode *SelectT2CMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, + SDValue InFlag); + SDNode *SelectARMCMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, + SDValue InFlag); + + SDNode *SelectConcatVector(SDNode *N); + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector<SDValue> &OutOps); + + /// PairDRegs - Form a quad register from a pair of D registers. + /// + SDNode *PairDRegs(EVT VT, SDValue V0, SDValue V1); + + /// PairDRegs - Form a quad register pair from a pair of Q registers. + /// + SDNode *PairQRegs(EVT VT, SDValue V0, SDValue V1); + + /// QuadDRegs - Form a quad register pair from a quad of D registers. + /// + SDNode *QuadDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); + + /// QuadQRegs - Form 4 consecutive Q registers. + /// + SDNode *QuadQRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); + + /// OctoDRegs - Form 8 consecutive D registers. + /// + SDNode *OctoDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3, + SDValue V4, SDValue V5, SDValue V6, SDValue V7); +}; +} + +/// isInt32Immediate - This method tests to see if the node is a 32-bit constant +/// operand. If so Imm will receive the 32-bit value. +static bool isInt32Immediate(SDNode *N, unsigned &Imm) { + if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { + Imm = cast<ConstantSDNode>(N)->getZExtValue(); + return true; + } + return false; +} + +// isInt32Immediate - This method tests to see if a constant operand. +// If so Imm will receive the 32 bit value. +static bool isInt32Immediate(SDValue N, unsigned &Imm) { + return isInt32Immediate(N.getNode(), Imm); +} + +// isOpcWithIntImmediate - This method tests to see if the node is a specific +// opcode and that it has a immediate integer right operand. +// If so Imm will receive the 32 bit value. +static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { + return N->getOpcode() == Opc && + isInt32Immediate(N->getOperand(1).getNode(), Imm); +} + + +bool ARMDAGToDAGISel::SelectShifterOperandReg(SDNode *Op, + SDValue N, + SDValue &BaseReg, + SDValue &ShReg, + SDValue &Opc) { + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N); + + // Don't match base register only case. That is matched to a separate + // lower complexity pattern with explicit register operand. 
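+  // (That is, this matches operands like "r1, lsl #2" or "r1, lsl r2"; a bare
+  // register is intentionally left to the cheaper pattern.)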
+ if (ShOpcVal == ARM_AM::no_shift) return false; + + BaseReg = N.getOperand(0); + unsigned ShImmVal = 0; + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + ShReg = CurDAG->getRegister(0, MVT::i32); + ShImmVal = RHS->getZExtValue() & 31; + } else { + ShReg = N.getOperand(1); + } + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), + MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N, + SDValue &Base, SDValue &Offset, + SDValue &Opc) { + if (N.getOpcode() == ISD::MUL) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + // X * [3,5,9] -> X + X * [2,4,8] etc. + int RHSC = (int)RHS->getZExtValue(); + if (RHSC & 1) { + RHSC = RHSC & ~1; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + if (isPowerOf2_32(RHSC)) { + unsigned ShAmt = Log2_32(RHSC); + Base = Offset = N.getOperand(0); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, + ARM_AM::lsl), + MVT::i32); + return true; + } + } + } + } + + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } else if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + } + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0, + ARM_AM::no_shift), + MVT::i32); + return true; + } + + // Match simple R +/- imm12 operands. + if (N.getOpcode() == ISD::ADD) + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if ((RHSC >= 0 && RHSC < 0x1000) || + (RHSC < 0 && RHSC > -0x1000)) { // 12 bits. + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); + + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC, + ARM_AM::no_shift), + MVT::i32); + return true; + } + } + + // Otherwise this is R +/- [possibly shifted] R. + ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::ADD ? ARM_AM::add:ARM_AM::sub; + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1)); + unsigned ShAmt = 0; + + Base = N.getOperand(0); + Offset = N.getOperand(1); + + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + Offset = N.getOperand(1).getOperand(0); + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + // Try matching (R shl C) + (R). + if (N.getOpcode() == ISD::ADD && ShOpcVal == ARM_AM::no_shift) { + ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0)); + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't + // fold it. 
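+ // A variable shift amount, e.g. (add r0, (shl r1, r2)), cannot be encoded
+ // in addressing mode 2, so fall back to treating the operand as an
+ // unshifted register.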
+ if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + Offset = N.getOperand(0).getOperand(0); + Base = N.getOperand(1); + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) + ? ARM_AM::add : ARM_AM::sub; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { + int Val = (int)C->getZExtValue(); + if (Val >= 0 && Val < 0x1000) { // 12 bits. + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val, + ARM_AM::no_shift), + MVT::i32); + return true; + } + } + + Offset = N; + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N); + unsigned ShAmt = 0; + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + ShAmt = Sh->getZExtValue(); + Offset = N.getOperand(0); + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + MVT::i32); + return true; +} + + +bool ARMDAGToDAGISel::SelectAddrMode3(SDNode *Op, SDValue N, + SDValue &Base, SDValue &Offset, + SDValue &Opc) { + if (N.getOpcode() == ISD::SUB) { + // X - C is canonicalize to X + -C, no need to handle it here. + Base = N.getOperand(0); + Offset = N.getOperand(1); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32); + return true; + } + + if (N.getOpcode() != ISD::ADD) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32); + return true; + } + + // If the RHS is +/- imm8, fold into addr mode. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if ((RHSC >= 0 && RHSC < 256) || + (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed. + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); + + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32); + return true; + } + } + + Base = N.getOperand(0); + Offset = N.getOperand(1); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) + ? 
ARM_AM::add : ARM_AM::sub; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { + int Val = (int)C->getZExtValue(); + if (Val >= 0 && Val < 256) { + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32); + return true; + } + } + + Offset = N; + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode4(SDNode *Op, SDValue N, + SDValue &Addr, SDValue &Mode) { + Addr = N; + Mode = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode5(SDNode *Op, SDValue N, + SDValue &Base, SDValue &Offset) { + if (N.getOpcode() != ISD::ADD) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } else if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + } + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), + MVT::i32); + return true; + } + + // If the RHS is +/- imm8, fold into addr mode. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if ((RHSC & 3) == 0) { // The constant is implicitly multiplied by 4. + RHSC >>= 2; + if ((RHSC >= 0 && RHSC < 256) || + (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed. + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC), + MVT::i32); + return true; + } + } + } + + Base = N; + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), + MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Op, SDValue N, + SDValue &Addr, SDValue &Align) { + Addr = N; + // Default to no alignment. + Align = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrModePC(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Label) { + if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) { + Offset = N.getOperand(0); + SDValue N1 = N.getOperand(1); + Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(), + MVT::i32); + return true; + } + return false; +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDNode *Op, SDValue N, + SDValue &Base, SDValue &Offset){ + // FIXME dl should come from the parent load or store, not the address + DebugLoc dl = Op->getDebugLoc(); + if (N.getOpcode() != ISD::ADD) { + ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N); + if (!NC || NC->getZExtValue() != 0) + return false; + + Base = Offset = N; + return true; + } + + Base = N.getOperand(0); + Offset = N.getOperand(1); + return true; +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeRI5(SDNode *Op, SDValue N, + unsigned Scale, SDValue &Base, + SDValue &OffImm, SDValue &Offset) { + if (Scale == 4) { + SDValue TmpBase, TmpOffImm; + if (SelectThumbAddrModeSP(Op, N, TmpBase, TmpOffImm)) + return false; // We want to select tLDRspi / tSTRspi instead. + if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() == ISD::TargetConstantPool) + return false; // We want to select tLDRpci instead. 
+ } + + if (N.getOpcode() != ISD::ADD) { + if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + } else + Base = N; + + Offset = CurDAG->getRegister(0, MVT::i32); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + // Thumb does not have [sp, r] address mode. + RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); + RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); + if ((LHSR && LHSR->getReg() == ARM::SP) || + (RHSR && RHSR->getReg() == ARM::SP)) { + Base = N; + Offset = CurDAG->getRegister(0, MVT::i32); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + // If the RHS is + imm5 * scale, fold into addr mode. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if ((RHSC & (Scale-1)) == 0) { // The constant is implicitly multiplied. + RHSC /= Scale; + if (RHSC >= 0 && RHSC < 32) { + Base = N.getOperand(0); + Offset = CurDAG->getRegister(0, MVT::i32); + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + } + + Base = N.getOperand(0); + Offset = N.getOperand(1); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeS1(SDNode *Op, SDValue N, + SDValue &Base, SDValue &OffImm, + SDValue &Offset) { + return SelectThumbAddrModeRI5(Op, N, 1, Base, OffImm, Offset); +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeS2(SDNode *Op, SDValue N, + SDValue &Base, SDValue &OffImm, + SDValue &Offset) { + return SelectThumbAddrModeRI5(Op, N, 2, Base, OffImm, Offset); +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeS4(SDNode *Op, SDValue N, + SDValue &Base, SDValue &OffImm, + SDValue &Offset) { + return SelectThumbAddrModeRI5(Op, N, 4, Base, OffImm, Offset); +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDNode *Op, SDValue N, + SDValue &Base, SDValue &OffImm) { + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (N.getOpcode() != ISD::ADD) + return false; + + RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); + if (N.getOperand(0).getOpcode() == ISD::FrameIndex || + (LHSR && LHSR->getReg() == ARM::SP)) { + // If the RHS is + imm8 * scale, fold into addr mode. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if ((RHSC & 3) == 0) { // The constant is implicitly multiplied. + RHSC >>= 2; + if (RHSC >= 0 && RHSC < 256) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + } + } + + return false; +} + +bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDNode *Op, SDValue N, + SDValue &BaseReg, + SDValue &Opc) { + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N); + + // Don't match base register only case. That is matched to a separate + // lower complexity pattern with explicit register operand. 
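+ // Thumb2 shifted-register operands only take an immediate shift amount,
+ // so register-shifted forms are also rejected below.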
+ if (ShOpcVal == ARM_AM::no_shift) return false; + + BaseReg = N.getOperand(0); + unsigned ShImmVal = 0; + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + ShImmVal = RHS->getZExtValue() & 31; + Opc = getI32Imm(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal)); + return true; + } + + return false; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDNode *Op, SDValue N, + SDValue &Base, SDValue &OffImm) { + // Match simple R + imm12 operands. + + // Base only. + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) { + if (N.getOpcode() == ISD::FrameIndex) { + // Match frame index... + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } else if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::TargetConstantPool) + return false; // We want to select t2LDRpci instead. + } else + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + if (SelectT2AddrModeImm8(Op, N, Base, OffImm)) + // Let t2LDRi8 handle (R - imm8). + return false; + + int RHSC = (int)RHS->getZExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned) + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDNode *Op, SDValue N, + SDValue &Base, SDValue &OffImm) { + // Match simple R - imm8 operands. + if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::SUB) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getSExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative) + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + } + + return false; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, + SDValue &OffImm){ + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N)) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC >= 0 && RHSC < 0x100) { // 8 bits. + OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) + ? CurDAG->getTargetConstant(RHSC, MVT::i32) + : CurDAG->getTargetConstant(-RHSC, MVT::i32); + return true; + } + } + + return false; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, + SDValue &Base, SDValue &OffImm) { + if (N.getOpcode() == ISD::ADD) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (((RHSC & 0x3) == 0) && + ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { // 8 bits. 
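+ // RHSC is a multiple of 4 in the range (-1024, 1024), i.e. an 8-bit
+ // immediate scaled by 4.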
+ Base = N.getOperand(0); + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + } else if (N.getOpcode() == ISD::SUB) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (((RHSC & 0x3) == 0) && (RHSC >= 0 && RHSC < 0x400)) { // 8 bits. + Base = N.getOperand(0); + OffImm = CurDAG->getTargetConstant(-RHSC, MVT::i32); + return true; + } + } + } + + return false; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDNode *Op, SDValue N, + SDValue &Base, + SDValue &OffReg, SDValue &ShImm) { + // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12. + if (N.getOpcode() != ISD::ADD) + return false; + + // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned) + return false; + else if (RHSC < 0 && RHSC >= -255) // 8 bits + return false; + } + + // Look for (R + R) or (R + (R << [1,2,3])). + unsigned ShAmt = 0; + Base = N.getOperand(0); + OffReg = N.getOperand(1); + + // Swap if it is ((R << c) + R). + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg); + if (ShOpcVal != ARM_AM::lsl) { + ShOpcVal = ARM_AM::getShiftOpcForNode(Base); + if (ShOpcVal == ARM_AM::lsl) + std::swap(Base, OffReg); + } + + if (ShOpcVal == ARM_AM::lsl) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(OffReg.getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (ShAmt >= 4) { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } else + OffReg = OffReg.getOperand(0); + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + ShImm = CurDAG->getTargetConstant(ShAmt, MVT::i32); + + return true; +} + +//===--------------------------------------------------------------------===// + +/// getAL - Returns a ARMCC::AL immediate node. +static inline SDValue getAL(SelectionDAG *CurDAG) { + return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, MVT::i32); +} + +SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) { + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return NULL; + + EVT LoadedVT = LD->getMemoryVT(); + SDValue Offset, AMOpc; + bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + unsigned Opcode = 0; + bool Match = false; + if (LoadedVT == MVT::i32 && + SelectAddrMode2Offset(N, LD->getOffset(), Offset, AMOpc)) { + Opcode = isPre ? ARM::LDR_PRE : ARM::LDR_POST; + Match = true; + } else if (LoadedVT == MVT::i16 && + SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = (LD->getExtensionType() == ISD::SEXTLOAD) + ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST) + : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST); + } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) { + if (LD->getExtensionType() == ISD::SEXTLOAD) { + if (SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST; + } + } else { + if (SelectAddrMode2Offset(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = isPre ? 
ARM::LDRB_PRE : ARM::LDRB_POST; + } + } + } + + if (Match) { + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), Chain }; + return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32, + MVT::Other, Ops, 6); + } + + return NULL; +} + +SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return NULL; + + EVT LoadedVT = LD->getMemoryVT(); + bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + SDValue Offset; + bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + unsigned Opcode = 0; + bool Match = false; + if (SelectT2AddrModeImm8Offset(N, LD->getOffset(), Offset)) { + switch (LoadedVT.getSimpleVT().SimpleTy) { + case MVT::i32: + Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST; + break; + case MVT::i16: + if (isSExtLd) + Opcode = isPre ? ARM::t2LDRSH_PRE : ARM::t2LDRSH_POST; + else + Opcode = isPre ? ARM::t2LDRH_PRE : ARM::t2LDRH_POST; + break; + case MVT::i8: + case MVT::i1: + if (isSExtLd) + Opcode = isPre ? ARM::t2LDRSB_PRE : ARM::t2LDRSB_POST; + else + Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST; + break; + default: + return NULL; + } + Match = true; + } + + if (Match) { + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[]= { Base, Offset, getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), Chain }; + return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32, + MVT::Other, Ops, 5); + } + + return NULL; +} + +/// PairDRegs - Form a quad register from a pair of D registers. +/// +SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); + if (llvm::ModelWithRegSequence()) { + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); + } + SDValue Undef = + SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0); + SDNode *Pair = CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, dl, + VT, Undef, V0, SubReg0); + return CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, dl, + VT, SDValue(Pair, 0), V1, SubReg1); +} + +/// PairQRegs - Form 4 consecutive D registers from a pair of Q registers. +/// +SDNode *ARMDAGToDAGISel::PairQRegs(EVT VT, SDValue V0, SDValue V1) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); +} + +/// QuadDRegs - Form 4 consecutive D registers. 
+/// +SDNode *ARMDAGToDAGISel::QuadDRegs(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8); +} + +/// QuadQRegs - Form 4 consecutive Q registers. +/// +SDNode *ARMDAGToDAGISel::QuadQRegs(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::qsub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, MVT::i32); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8); +} + +/// OctoDRegs - Form 8 consecutive D registers. +/// +SDNode *ARMDAGToDAGISel::OctoDRegs(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3, + SDValue V4, SDValue V5, + SDValue V6, SDValue V7) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32); + SDValue SubReg4 = CurDAG->getTargetConstant(ARM::dsub_4, MVT::i32); + SDValue SubReg5 = CurDAG->getTargetConstant(ARM::dsub_5, MVT::i32); + SDValue SubReg6 = CurDAG->getTargetConstant(ARM::dsub_6, MVT::i32); + SDValue SubReg7 = CurDAG->getTargetConstant(ARM::dsub_7, MVT::i32); + const SDValue Ops[] ={ V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3, + V4, SubReg4, V5, SubReg5, V6, SubReg6, V7, SubReg7 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 16); +} + +/// GetNEONSubregVT - Given a type for a 128-bit NEON vector, return the type +/// for a 64-bit subregister of the vector. 
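+/// For example, v16i8 maps to v8i8 and v4f32 maps to v2f32.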
+static EVT GetNEONSubregVT(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled NEON type"); + case MVT::v16i8: return MVT::v8i8; + case MVT::v8i16: return MVT::v4i16; + case MVT::v4f32: return MVT::v2f32; + case MVT::v4i32: return MVT::v2i32; + case MVT::v2i64: return MVT::v1i64; + } +} + +SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, + unsigned *QOpcodes1) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + + SDValue MemAddr, Align; + if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align)) + return NULL; + + SDValue Chain = N->getOperand(0); + EVT VT = N->getValueType(0); + bool is64BitVector = VT.is64BitVector(); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + case MVT::v1i64: OpcodeIndex = 3; break; + // Quad-register operations: + case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8i16: OpcodeIndex = 1; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VLD1"); + break; + } + + SDValue Pred = getAL(CurDAG); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + if (is64BitVector) { + unsigned Opc = DOpcodes[OpcodeIndex]; + const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; + std::vector<EVT> ResTys(NumVecs, VT); + ResTys.push_back(MVT::Other); + SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); + if (!llvm::ModelWithRegSequence() || NumVecs < 2) + return VLd; + + SDValue RegSeq; + SDValue V0 = SDValue(VLd, 0); + SDValue V1 = SDValue(VLd, 1); + + // Form a REG_SEQUENCE to force register allocation. + if (NumVecs == 2) + RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); + else { + SDValue V2 = SDValue(VLd, 2); + // If it's a vld3, form a quad D-register but discard the last part. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : SDValue(VLd, 3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue D = CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec, + dl, VT, RegSeq); + ReplaceUses(SDValue(N, Vec), D); + } + ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, NumVecs)); + return NULL; + } + + EVT RegVT = GetNEONSubregVT(VT); + if (NumVecs <= 2) { + // Quad registers are directly supported for VLD1 and VLD2, + // loading pairs of D regs. + unsigned Opc = QOpcodes0[OpcodeIndex]; + const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; + std::vector<EVT> ResTys(2 * NumVecs, RegVT); + ResTys.push_back(MVT::Other); + SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); + Chain = SDValue(VLd, 2 * NumVecs); + + // Combine the even and odd subregs to produce the result. 
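+ // With REG_SEQUENCE modeling, the loaded D halves are grouped into a
+ // single node and the Q results are re-extracted from it; otherwise each
+ // Q register is rebuilt from its two D halves via INSERT_SUBREG.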
+ if (llvm::ModelWithRegSequence()) { + if (NumVecs == 1) { + SDNode *Q = PairDRegs(VT, SDValue(VLd, 0), SDValue(VLd, 1)); + ReplaceUses(SDValue(N, 0), SDValue(Q, 0)); + } else { + SDValue QQ = SDValue(QuadDRegs(MVT::v4i64, + SDValue(VLd, 0), SDValue(VLd, 1), + SDValue(VLd, 2), SDValue(VLd, 3)), 0); + SDValue Q0 = CurDAG->getTargetExtractSubreg(ARM::qsub_0, dl, VT, QQ); + SDValue Q1 = CurDAG->getTargetExtractSubreg(ARM::qsub_1, dl, VT, QQ); + ReplaceUses(SDValue(N, 0), Q0); + ReplaceUses(SDValue(N, 1), Q1); + } + } else { + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDNode *Q = PairDRegs(VT, SDValue(VLd, 2*Vec), SDValue(VLd, 2*Vec+1)); + ReplaceUses(SDValue(N, Vec), SDValue(Q, 0)); + } + } + } else { + // Otherwise, quad registers are loaded with two separate instructions, + // where one loads the even registers and the other loads the odd registers. + + std::vector<EVT> ResTys(NumVecs, RegVT); + ResTys.push_back(MemAddr.getValueType()); + ResTys.push_back(MVT::Other); + + // Load the even subregs. + unsigned Opc = QOpcodes0[OpcodeIndex]; + const SDValue OpsA[] = { MemAddr, Align, Reg0, Pred, Reg0, Chain }; + SDNode *VLdA = CurDAG->getMachineNode(Opc, dl, ResTys, OpsA, 6); + Chain = SDValue(VLdA, NumVecs+1); + + // Load the odd subregs. + Opc = QOpcodes1[OpcodeIndex]; + const SDValue OpsB[] = { SDValue(VLdA, NumVecs), + Align, Reg0, Pred, Reg0, Chain }; + SDNode *VLdB = CurDAG->getMachineNode(Opc, dl, ResTys, OpsB, 6); + Chain = SDValue(VLdB, NumVecs+1); + + if (llvm::ModelWithRegSequence()) { + SDValue V0 = SDValue(VLdA, 0); + SDValue V1 = SDValue(VLdB, 0); + SDValue V2 = SDValue(VLdA, 1); + SDValue V3 = SDValue(VLdB, 1); + SDValue V4 = SDValue(VLdA, 2); + SDValue V5 = SDValue(VLdB, 2); + SDValue V6 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), + 0) + : SDValue(VLdA, 3); + SDValue V7 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), + 0) + : SDValue(VLdB, 3); + SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V0, V1, V2, V3, + V4, V5, V6, V7), 0); + + // Extract out the 3 / 4 Q registers. + assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, + dl, VT, RegSeq); + ReplaceUses(SDValue(N, Vec), Q); + } + } else { + // Combine the even and odd subregs to produce the result. 
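+ // VLdA produced the even D subregs and VLdB the odd ones; pairing result
+ // Vec of each rebuilds Q register number Vec.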
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDNode *Q = PairDRegs(VT, SDValue(VLdA, Vec), SDValue(VLdB, Vec)); + ReplaceUses(SDValue(N, Vec), SDValue(Q, 0)); + } + } + } + ReplaceUses(SDValue(N, NumVecs), Chain); + return NULL; +} + +SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, + unsigned *QOpcodes1) { + assert(NumVecs >=1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + + SDValue MemAddr, Align; + if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align)) + return NULL; + + SDValue Chain = N->getOperand(0); + EVT VT = N->getOperand(3).getValueType(); + bool is64BitVector = VT.is64BitVector(); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + case MVT::v1i64: OpcodeIndex = 3; break; + // Quad-register operations: + case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8i16: OpcodeIndex = 1; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VST1"); + break; + } + + SDValue Pred = getAL(CurDAG); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + + SmallVector<SDValue, 10> Ops; + Ops.push_back(MemAddr); + Ops.push_back(Align); + + if (is64BitVector) { + if (llvm::ModelWithRegSequence() && NumVecs >= 2) { + SDValue RegSeq; + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + + // Form a REG_SEQUENCE to force register allocation. + if (NumVecs == 2) + RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); + else { + SDValue V2 = N->getOperand(2+3); + // If it's a vld3, form a quad D-register and leave the last part as + // an undef. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(3+3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + + // Now extract the D registers back out. + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, + RegSeq)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, + RegSeq)); + if (NumVecs > 2) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, + RegSeq)); + if (NumVecs > 3) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, + RegSeq)); + } else { + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(N->getOperand(Vec+3)); + } + Ops.push_back(Pred); + Ops.push_back(Reg0); // predicate register + Ops.push_back(Chain); + unsigned Opc = DOpcodes[OpcodeIndex]; + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+5); + } + + EVT RegVT = GetNEONSubregVT(VT); + if (NumVecs <= 2) { + // Quad registers are directly supported for VST1 and VST2, + // storing pairs of D regs. + unsigned Opc = QOpcodes0[OpcodeIndex]; + if (llvm::ModelWithRegSequence() && NumVecs == 2) { + // First extract the pair of Q registers. + SDValue Q0 = N->getOperand(3); + SDValue Q1 = N->getOperand(4); + + // Form a QQ register. + SDValue QQ = SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0); + + // Now extract the D registers back out. 
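+ // The four D subregs of the QQ pair become the register operands of the
+ // quad-register store.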
+ Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, + QQ)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, + QQ)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, RegVT, + QQ)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, RegVT, + QQ)); + Ops.push_back(Pred); + Ops.push_back(Reg0); // predicate register + Ops.push_back(Chain); + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 5 + 4); + } else { + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, + N->getOperand(Vec+3))); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, + N->getOperand(Vec+3))); + } + Ops.push_back(Pred); + Ops.push_back(Reg0); // predicate register + Ops.push_back(Chain); + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), + 5 + 2 * NumVecs); + } + } + + // Otherwise, quad registers are stored with two separate instructions, + // where one stores the even registers and the other stores the odd registers. + if (llvm::ModelWithRegSequence()) { + // Form the QQQQ REG_SEQUENCE. + SDValue V[8]; + for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { + V[i] = CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, + N->getOperand(Vec+3)); + V[i+1] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, + N->getOperand(Vec+3)); + } + if (NumVecs == 3) + V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + + SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], + V[4], V[5], V[6], V[7]), 0); + + // Store the even D registers. + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + Ops.push_back(Reg0); // post-access address offset + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl, + RegVT, RegSeq)); + Ops.push_back(Pred); + Ops.push_back(Reg0); // predicate register + Ops.push_back(Chain); + unsigned Opc = QOpcodes0[OpcodeIndex]; + SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, Ops.data(), NumVecs+6); + Chain = SDValue(VStA, 1); + + // Store the odd D registers. + Ops[0] = SDValue(VStA, 0); // MemAddr + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl, + RegVT, RegSeq); + Ops[NumVecs+5] = Chain; + Opc = QOpcodes1[OpcodeIndex]; + SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, Ops.data(), NumVecs+6); + Chain = SDValue(VStB, 1); + ReplaceUses(SDValue(N, 0), Chain); + return NULL; + } else { + Ops.push_back(Reg0); // post-access address offset + + // Store the even subregs. + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, + N->getOperand(Vec+3))); + Ops.push_back(Pred); + Ops.push_back(Reg0); // predicate register + Ops.push_back(Chain); + unsigned Opc = QOpcodes0[OpcodeIndex]; + SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, Ops.data(), NumVecs+6); + Chain = SDValue(VStA, 1); + + // Store the odd subregs. 
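+ // Reuse the operand vector from the first store; only the updated address,
+ // the odd D subregs, and the chain change for the second one.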
+ Ops[0] = SDValue(VStA, 0); // MemAddr + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, + N->getOperand(Vec+3)); + Ops[NumVecs+5] = Chain; + Opc = QOpcodes1[OpcodeIndex]; + SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, Ops.data(), NumVecs+6); + Chain = SDValue(VStB, 1); + ReplaceUses(SDValue(N, 0), Chain); + return NULL; + } +} + +SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, + unsigned NumVecs, unsigned *DOpcodes, + unsigned *QOpcodes0, + unsigned *QOpcodes1) { + assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + + SDValue MemAddr, Align; + if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align)) + return NULL; + + SDValue Chain = N->getOperand(0); + unsigned Lane = + cast<ConstantSDNode>(N->getOperand(NumVecs+3))->getZExtValue(); + EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType(); + bool is64BitVector = VT.is64BitVector(); + + // Quad registers are handled by load/store of subregs. Find the subreg info. + unsigned NumElts = 0; + int SubregIdx = 0; + bool Even = false; + EVT RegVT = VT; + if (!is64BitVector) { + RegVT = GetNEONSubregVT(VT); + NumElts = RegVT.getVectorNumElements(); + SubregIdx = (Lane < NumElts) ? ARM::dsub_0 : ARM::dsub_1; + Even = Lane < NumElts; + } + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld/vst lane type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + // Quad-register operations: + case MVT::v8i16: OpcodeIndex = 0; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 1; break; + } + + SDValue Pred = getAL(CurDAG); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + + SmallVector<SDValue, 10> Ops; + Ops.push_back(MemAddr); + Ops.push_back(Align); + + unsigned Opc = 0; + if (is64BitVector) { + Opc = DOpcodes[OpcodeIndex]; + if (llvm::ModelWithRegSequence()) { + SDValue RegSeq; + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + if (NumVecs == 2) { + RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); + } else { + SDValue V2 = N->getOperand(2+3); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(3+3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + + // Now extract the D registers back out. + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, + RegSeq)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, + RegSeq)); + if (NumVecs > 2) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, + RegSeq)); + if (NumVecs > 3) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, + RegSeq)); + } else { + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(N->getOperand(Vec+3)); + } + } else { + // Check if this is loading the even or odd subreg of a Q register. + if (Lane < NumElts) { + Opc = QOpcodes0[OpcodeIndex]; + } else { + Lane -= NumElts; + Opc = QOpcodes1[OpcodeIndex]; + } + + if (llvm::ModelWithRegSequence()) { + SDValue RegSeq; + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + if (NumVecs == 2) { + RegSeq = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); + } else { + SDValue V2 = N->getOperand(2+3); + SDValue V3 = (NumVecs == 3) + ? 
SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(3+3); + RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); + } + + // Extract the subregs of the input vector. + unsigned SubIdx = Even ? ARM::dsub_0 : ARM::dsub_1; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(SubIdx+Vec*2, dl, RegVT, + RegSeq)); + } else { + // Extract the subregs of the input vector. + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(SubregIdx, dl, RegVT, + N->getOperand(Vec+3))); + } + } + Ops.push_back(getI32Imm(Lane)); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + + if (!IsLoad) + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+6); + + std::vector<EVT> ResTys(NumVecs, RegVT); + ResTys.push_back(MVT::Other); + SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(),NumVecs+6); + + if (llvm::ModelWithRegSequence()) { + // Form a REG_SEQUENCE to force register allocation. + SDValue RegSeq; + if (is64BitVector) { + SDValue V0 = SDValue(VLdLn, 0); + SDValue V1 = SDValue(VLdLn, 1); + if (NumVecs == 2) { + RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); + } else { + SDValue V2 = SDValue(VLdLn, 2); + // If it's a vld3, form a quad D-register but discard the last part. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : SDValue(VLdLn, 3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + } else { + // For 128-bit vectors, take the 64-bit results of the load and insert them + // as subregs into the result. + SDValue V[8]; + for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { + if (Even) { + V[i] = SDValue(VLdLn, Vec); + V[i+1] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + } else { + V[i] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + V[i+1] = SDValue(VLdLn, Vec); + } + } + if (NumVecs == 3) + V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + + if (NumVecs == 2) + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V[0], V[1], V[2], V[3]), 0); + else + RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], + V[4], V[5], V[6], V[7]), 0); + } + + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, RegSeq)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, NumVecs)); + return NULL; + } + + // For a 64-bit vector load to D registers, nothing more needs to be done. + if (is64BitVector) + return VLdLn; + + // For 128-bit vectors, take the 64-bit results of the load and insert them + // as subregs into the result. + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue QuadVec = CurDAG->getTargetInsertSubreg(SubregIdx, dl, VT, + N->getOperand(Vec+3), + SDValue(VLdLn, Vec)); + ReplaceUses(SDValue(N, Vec), QuadVec); + } + + Chain = SDValue(VLdLn, NumVecs); + ReplaceUses(SDValue(N, NumVecs), Chain); + return NULL; +} + +SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, + bool isSigned) { + if (!Subtarget->hasV6T2Ops()) + return NULL; + + unsigned Opc = isSigned ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX) + : (Subtarget->isThumb() ? 
ARM::t2UBFX : ARM::UBFX); + + + // For unsigned extracts, check for a shift right and mask + unsigned And_imm = 0; + if (N->getOpcode() == ISD::AND) { + if (isOpcWithIntImmediate(N, ISD::AND, And_imm)) { + + // The immediate is a mask of the low bits iff imm & (imm+1) == 0 + if (And_imm & (And_imm + 1)) + return NULL; + + unsigned Srl_imm = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, + Srl_imm)) { + assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); + + unsigned Width = CountTrailingOnes_32(And_imm); + unsigned LSB = Srl_imm; + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0).getOperand(0), + CurDAG->getTargetConstant(LSB, MVT::i32), + CurDAG->getTargetConstant(Width, MVT::i32), + getAL(CurDAG), Reg0 }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + } + } + return NULL; + } + + // Otherwise, we're looking for a shift of a shift + unsigned Shl_imm = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!"); + unsigned Srl_imm = 0; + if (isInt32Immediate(N->getOperand(1), Srl_imm)) { + assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); + unsigned Width = 32 - Srl_imm; + int LSB = Srl_imm - Shl_imm; + if (LSB < 0) + return NULL; + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0).getOperand(0), + CurDAG->getTargetConstant(LSB, MVT::i32), + CurDAG->getTargetConstant(Width, MVT::i32), + getAL(CurDAG), Reg0 }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + } + } + return NULL; +} + +SDNode *ARMDAGToDAGISel:: +SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { + SDValue CPTmp0; + SDValue CPTmp1; + if (SelectT2ShifterOperandReg(N, TrueVal, CPTmp0, CPTmp1)) { + unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue(); + unsigned SOShOp = ARM_AM::getSORegShOp(SOVal); + unsigned Opc = 0; + switch (SOShOp) { + case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break; + case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break; + case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break; + case ARM_AM::ror: Opc = ARM::t2MOVCCror; break; + default: + llvm_unreachable("Unknown so_reg opcode!"); + break; + } + SDValue SOShImm = + CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32); + SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, CPTmp0, SOShImm, CC, CCR, InFlag }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32,Ops, 6); + } + return 0; +} + +SDNode *ARMDAGToDAGISel:: +SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { + SDValue CPTmp0; + SDValue CPTmp1; + SDValue CPTmp2; + if (SelectShifterOperandReg(N, TrueVal, CPTmp0, CPTmp1, CPTmp2)) { + SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag }; + return CurDAG->SelectNodeTo(N, ARM::MOVCCs, MVT::i32, Ops, 7); + } + return 0; +} + +SDNode *ARMDAGToDAGISel:: +SelectT2CMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { + ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal); + if (!T) + return 0; + + if (Predicate_t2_so_imm(TrueVal.getNode())) { + SDValue True = CurDAG->getTargetConstant(T->getZExtValue(), MVT::i32); + SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, True, 
CC, CCR, InFlag }; + return CurDAG->SelectNodeTo(N, + ARM::t2MOVCCi, MVT::i32, Ops, 5); + } + return 0; +} + +SDNode *ARMDAGToDAGISel:: +SelectARMCMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { + ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal); + if (!T) + return 0; + + if (Predicate_so_imm(TrueVal.getNode())) { + SDValue True = CurDAG->getTargetConstant(T->getZExtValue(), MVT::i32); + SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag }; + return CurDAG->SelectNodeTo(N, + ARM::MOVCCi, MVT::i32, Ops, 5); + } + return 0; +} + +SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) { + EVT VT = N->getValueType(0); + SDValue FalseVal = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue CC = N->getOperand(2); + SDValue CCR = N->getOperand(3); + SDValue InFlag = N->getOperand(4); + assert(CC.getOpcode() == ISD::Constant); + assert(CCR.getOpcode() == ISD::Register); + ARMCC::CondCodes CCVal = + (ARMCC::CondCodes)cast<ConstantSDNode>(CC)->getZExtValue(); + + if (!Subtarget->isThumb1Only() && VT == MVT::i32) { + // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) + // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) + // Pattern complexity = 18 cost = 1 size = 0 + SDValue CPTmp0; + SDValue CPTmp1; + SDValue CPTmp2; + if (Subtarget->isThumb()) { + SDNode *Res = SelectT2CMOVShiftOp(N, FalseVal, TrueVal, + CCVal, CCR, InFlag); + if (!Res) + Res = SelectT2CMOVShiftOp(N, TrueVal, FalseVal, + ARMCC::getOppositeCondition(CCVal), CCR, InFlag); + if (Res) + return Res; + } else { + SDNode *Res = SelectARMCMOVShiftOp(N, FalseVal, TrueVal, + CCVal, CCR, InFlag); + if (!Res) + Res = SelectARMCMOVShiftOp(N, TrueVal, FalseVal, + ARMCC::getOppositeCondition(CCVal), CCR, InFlag); + if (Res) + return Res; + } + + // Pattern: (ARMcmov:i32 GPR:i32:$false, + // (imm:i32)<<P:Predicate_so_imm>>:$true, + // (imm:i32):$cc) + // Emits: (MOVCCi:i32 GPR:i32:$false, + // (so_imm:i32 (imm:i32):$true), (imm:i32):$cc) + // Pattern complexity = 10 cost = 1 size = 0 + if (Subtarget->isThumb()) { + SDNode *Res = SelectT2CMOVSoImmOp(N, FalseVal, TrueVal, + CCVal, CCR, InFlag); + if (!Res) + Res = SelectT2CMOVSoImmOp(N, TrueVal, FalseVal, + ARMCC::getOppositeCondition(CCVal), CCR, InFlag); + if (Res) + return Res; + } else { + SDNode *Res = SelectARMCMOVSoImmOp(N, FalseVal, TrueVal, + CCVal, CCR, InFlag); + if (!Res) + Res = SelectARMCMOVSoImmOp(N, TrueVal, FalseVal, + ARMCC::getOppositeCondition(CCVal), CCR, InFlag); + if (Res) + return Res; + } + } + + // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + // + // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Pattern complexity = 6 cost = 11 size = 0 + // + // Also FCPYScc and FCPYDcc. + SDValue Tmp2 = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, TrueVal, Tmp2, CCR, InFlag }; + unsigned Opc = 0; + switch (VT.getSimpleVT().SimpleTy) { + default: assert(false && "Illegal conditional move type!"); + break; + case MVT::i32: + Opc = Subtarget->isThumb() + ? (Subtarget->hasThumb2() ? 
ARM::t2MOVCCr : ARM::tMOVCCr_pseudo) + : ARM::MOVCCr; + break; + case MVT::f32: + Opc = ARM::VMOVScc; + break; + case MVT::f64: + Opc = ARM::VMOVDcc; + break; + } + return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5); +} + +SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) { + // The only time a CONCAT_VECTORS operation can have legal types is when + // two 64-bit vectors are concatenated to a 128-bit vector. + EVT VT = N->getValueType(0); + if (!VT.is128BitVector() || N->getNumOperands() != 2) + llvm_unreachable("unexpected CONCAT_VECTORS"); + DebugLoc dl = N->getDebugLoc(); + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); +} + +SDNode *ARMDAGToDAGISel::Select(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + + if (N->isMachineOpcode()) + return NULL; // Already selected. + + switch (N->getOpcode()) { + default: break; + case ISD::Constant: { + unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); + bool UseCP = true; + if (Subtarget->hasThumb2()) + // Thumb2-aware targets have the MOVT instruction, so all immediates can + // be done with MOV + MOVT, at worst. + UseCP = 0; + else { + if (Subtarget->isThumb()) { + UseCP = (Val > 255 && // MOV + ~Val > 255 && // MOV + MVN + !ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL + } else + UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV + ARM_AM::getSOImmVal(~Val) == -1 && // MVN + !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs. + } + + if (UseCP) { + SDValue CPIdx = + CurDAG->getTargetConstantPool(ConstantInt::get( + Type::getInt32Ty(*CurDAG->getContext()), Val), + TLI.getPointerTy()); + + SDNode *ResNode; + if (Subtarget->isThumb1Only()) { + SDValue Pred = getAL(CurDAG); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() }; + ResNode = CurDAG->getMachineNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other, + Ops, 4); + } else { + SDValue Ops[] = { + CPIdx, + CurDAG->getRegister(0, MVT::i32), + CurDAG->getTargetConstant(0, MVT::i32), + getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getEntryNode() + }; + ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, + Ops, 6); + } + ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0)); + return NULL; + } + + // Other cases are autogenerated. + break; + } + case ISD::FrameIndex: { + // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm. + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + if (Subtarget->isThumb1Only()) { + return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, TFI, + CurDAG->getTargetConstant(0, MVT::i32)); + } else { + unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ? 
+ ARM::t2ADDri : ARM::ADDri);
+ SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+ }
+ }
+ case ISD::SRL:
+ if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
+ return I;
+ break;
+ case ISD::SRA:
+ if (SDNode *I = SelectV6T2BitfieldExtractOp(N, true))
+ return I;
+ break;
+ case ISD::MUL:
+ if (Subtarget->isThumb1Only())
+ break;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ unsigned RHSV = C->getZExtValue();
+ if (!RHSV) break;
+ if (isPowerOf2_32(RHSV-1)) { // 2^n+1?
+ unsigned ShImm = Log2_32(RHSV-1);
+ if (ShImm >= 32)
+ break;
+ SDValue V = N->getOperand(0);
+ ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+ SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6);
+ } else {
+ SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7);
+ }
+ }
+ if (isPowerOf2_32(RHSV+1)) { // 2^n-1?
+ unsigned ShImm = Log2_32(RHSV+1);
+ if (ShImm >= 32)
+ break;
+ SDValue V = N->getOperand(0);
+ ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+ SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 5);
+ } else {
+ SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7);
+ }
+ }
+ }
+ break;
+ case ISD::AND: {
+ // Check for unsigned bitfield extract
+ if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
+ return I;
+
+ // (and (or x, c2), c1): when the top 16 bits of c1 and c2 match, the lower
+ // 16 bits of c1 are 0xffff, and the lower 16 bits of c2 are 0, the top
+ // 16 bits of the result are entirely contributed by c2 and the lower
+ // 16 bits entirely by x. That equals (or (and x, 0xffff), (and c2, 0xffff0000)).
+ // Select it to: "movt x, ((c2 & 0xffff0000) >> 16)".
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32)
+ break;
+ unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2())
+ ? ARM::t2MOVTi16
+ : (Subtarget->hasV6T2Ops() ?
ARM::MOVTi16 : 0); + if (!Opc) + break; + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (!N1C) + break; + if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) { + SDValue N2 = N0.getOperand(1); + ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2); + if (!N2C) + break; + unsigned N1CVal = N1C->getZExtValue(); + unsigned N2CVal = N2C->getZExtValue(); + if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) && + (N1CVal & 0xffffU) == 0xffffU && + (N2CVal & 0xffffU) == 0x0U) { + SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16, + MVT::i32); + SDValue Ops[] = { N0.getOperand(0), Imm16, + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Opc, dl, VT, Ops, 4); + } + } + break; + } + case ARMISD::VMOVRRD: + return CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32, + N->getOperand(0), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32)); + case ISD::UMUL_LOHI: { + if (Subtarget->isThumb1Only()) + break; + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops,4); + } else { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5); + } + } + case ISD::SMUL_LOHI: { + if (Subtarget->isThumb1Only()) + break; + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops,4); + } else { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5); + } + } + case ISD::LOAD: { + SDNode *ResNode = 0; + if (Subtarget->isThumb() && Subtarget->hasThumb2()) + ResNode = SelectT2IndexedLoad(N); + else + ResNode = SelectARMIndexedLoad(N); + if (ResNode) + return ResNode; + + // VLDMQ must be custom-selected for "v2f64 load" to set the AM5Opc value. + if (Subtarget->hasVFP2() && + N->getValueType(0).getSimpleVT().SimpleTy == MVT::v2f64) { + SDValue Chain = N->getOperand(0); + SDValue AM5Opc = + CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); + SDValue Pred = getAL(CurDAG); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(1), AM5Opc, Pred, PredReg, Chain }; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); + SDNode *Ret = CurDAG->getMachineNode(ARM::VLDMQ, dl, + MVT::v2f64, MVT::Other, Ops, 5); + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + return Ret; + } + // Other cases are autogenerated. + break; + } + case ISD::STORE: { + // VSTMQ must be custom-selected for "v2f64 store" to set the AM5Opc value. 
+ if (Subtarget->hasVFP2() && + N->getOperand(1).getValueType().getSimpleVT().SimpleTy == MVT::v2f64) { + SDValue Chain = N->getOperand(0); + SDValue AM5Opc = + CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); + SDValue Pred = getAL(CurDAG); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(1), N->getOperand(2), + AM5Opc, Pred, PredReg, Chain }; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); + SDNode *Ret = CurDAG->getMachineNode(ARM::VSTMQ, dl, MVT::Other, Ops, 6); + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + return Ret; + } + // Other cases are autogenerated. + break; + } + case ARMISD::BRCOND: { + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + unsigned Opc = Subtarget->isThumb() ? + ((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc; + SDValue Chain = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + SDValue N3 = N->getOperand(3); + SDValue InFlag = N->getOperand(4); + assert(N1.getOpcode() == ISD::BasicBlock); + assert(N2.getOpcode() == ISD::Constant); + assert(N3.getOpcode() == ISD::Register); + + SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getZExtValue()), + MVT::i32); + SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag }; + SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, + MVT::Flag, Ops, 5); + Chain = SDValue(ResNode, 0); + if (N->getNumValues() == 2) { + InFlag = SDValue(ResNode, 1); + ReplaceUses(SDValue(N, 1), InFlag); + } + ReplaceUses(SDValue(N, 0), + SDValue(Chain.getNode(), Chain.getResNo())); + return NULL; + } + case ARMISD::CMOV: + return SelectCMOVOp(N); + case ARMISD::CNEG: { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + SDValue N3 = N->getOperand(3); + SDValue InFlag = N->getOperand(4); + assert(N2.getOpcode() == ISD::Constant); + assert(N3.getOpcode() == ISD::Register); + + SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getZExtValue()), + MVT::i32); + SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag }; + unsigned Opc = 0; + switch (VT.getSimpleVT().SimpleTy) { + default: assert(false && "Illegal conditional move type!"); + break; + case MVT::f32: + Opc = ARM::VNEGScc; + break; + case MVT::f64: + Opc = ARM::VNEGDcc; + break; + } + return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5); + } + + case ARMISD::VZIP: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::v8i8: Opc = ARM::VZIPd8; break; + case MVT::v4i16: Opc = ARM::VZIPd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VZIPd32; break; + case MVT::v16i8: Opc = ARM::VZIPq8; break; + case MVT::v8i16: Opc = ARM::VZIPq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VZIPq32; break; + } + SDValue Pred = getAL(CurDAG); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg 
}; + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); + } + case ARMISD::VUZP: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::v8i8: Opc = ARM::VUZPd8; break; + case MVT::v4i16: Opc = ARM::VUZPd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VUZPd32; break; + case MVT::v16i8: Opc = ARM::VUZPq8; break; + case MVT::v8i16: Opc = ARM::VUZPq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VUZPq32; break; + } + SDValue Pred = getAL(CurDAG); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); + } + case ARMISD::VTRN: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::v8i8: Opc = ARM::VTRNd8; break; + case MVT::v4i16: Opc = ARM::VTRNd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VTRNd32; break; + case MVT::v16i8: Opc = ARM::VTRNq8; break; + case MVT::v8i16: Opc = ARM::VTRNq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VTRNq32; break; + } + SDValue Pred = getAL(CurDAG); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); + } + + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: + break; + + case Intrinsic::arm_neon_vld1: { + unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, + ARM::VLD1d32, ARM::VLD1d64 }; + unsigned QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, + ARM::VLD1q32, ARM::VLD1q64 }; + return SelectVLD(N, 1, DOpcodes, QOpcodes, 0); + } + + case Intrinsic::arm_neon_vld2: { + unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, + ARM::VLD2d32, ARM::VLD1q64 }; + unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 }; + return SelectVLD(N, 2, DOpcodes, QOpcodes, 0); + } + + case Intrinsic::arm_neon_vld3: { + unsigned DOpcodes[] = { ARM::VLD3d8, ARM::VLD3d16, + ARM::VLD3d32, ARM::VLD1d64T }; + unsigned QOpcodes0[] = { ARM::VLD3q8_UPD, + ARM::VLD3q16_UPD, + ARM::VLD3q32_UPD }; + unsigned QOpcodes1[] = { ARM::VLD3q8odd_UPD, + ARM::VLD3q16odd_UPD, + ARM::VLD3q32odd_UPD }; + return SelectVLD(N, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld4: { + unsigned DOpcodes[] = { ARM::VLD4d8, ARM::VLD4d16, + ARM::VLD4d32, ARM::VLD1d64Q }; + unsigned QOpcodes0[] = { ARM::VLD4q8_UPD, + ARM::VLD4q16_UPD, + ARM::VLD4q32_UPD }; + unsigned QOpcodes1[] = { ARM::VLD4q8odd_UPD, + ARM::VLD4q16odd_UPD, + ARM::VLD4q32odd_UPD }; + return SelectVLD(N, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld2lane: { + unsigned DOpcodes[] = { ARM::VLD2LNd8, ARM::VLD2LNd16, ARM::VLD2LNd32 }; + unsigned QOpcodes0[] = { ARM::VLD2LNq16, ARM::VLD2LNq32 }; + unsigned QOpcodes1[] = { ARM::VLD2LNq16odd, ARM::VLD2LNq32odd }; + return SelectVLDSTLane(N, true, 2, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld3lane: { + unsigned DOpcodes[] = { ARM::VLD3LNd8, ARM::VLD3LNd16, ARM::VLD3LNd32 }; + unsigned QOpcodes0[] = { ARM::VLD3LNq16, ARM::VLD3LNq32 }; + unsigned QOpcodes1[] = { ARM::VLD3LNq16odd, ARM::VLD3LNq32odd }; + return SelectVLDSTLane(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld4lane: { + unsigned DOpcodes[] = { ARM::VLD4LNd8, 
ARM::VLD4LNd16, ARM::VLD4LNd32 }; + unsigned QOpcodes0[] = { ARM::VLD4LNq16, ARM::VLD4LNq32 }; + unsigned QOpcodes1[] = { ARM::VLD4LNq16odd, ARM::VLD4LNq32odd }; + return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst1: { + unsigned DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, + ARM::VST1d32, ARM::VST1d64 }; + unsigned QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, + ARM::VST1q32, ARM::VST1q64 }; + return SelectVST(N, 1, DOpcodes, QOpcodes, 0); + } + + case Intrinsic::arm_neon_vst2: { + unsigned DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, + ARM::VST2d32, ARM::VST1q64 }; + unsigned QOpcodes[] = { ARM::VST2q8, ARM::VST2q16, ARM::VST2q32 }; + return SelectVST(N, 2, DOpcodes, QOpcodes, 0); + } + + case Intrinsic::arm_neon_vst3: { + unsigned DOpcodes[] = { ARM::VST3d8, ARM::VST3d16, + ARM::VST3d32, ARM::VST1d64T }; + unsigned QOpcodes0[] = { ARM::VST3q8_UPD, + ARM::VST3q16_UPD, + ARM::VST3q32_UPD }; + unsigned QOpcodes1[] = { ARM::VST3q8odd_UPD, + ARM::VST3q16odd_UPD, + ARM::VST3q32odd_UPD }; + return SelectVST(N, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst4: { + unsigned DOpcodes[] = { ARM::VST4d8, ARM::VST4d16, + ARM::VST4d32, ARM::VST1d64Q }; + unsigned QOpcodes0[] = { ARM::VST4q8_UPD, + ARM::VST4q16_UPD, + ARM::VST4q32_UPD }; + unsigned QOpcodes1[] = { ARM::VST4q8odd_UPD, + ARM::VST4q16odd_UPD, + ARM::VST4q32odd_UPD }; + return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst2lane: { + unsigned DOpcodes[] = { ARM::VST2LNd8, ARM::VST2LNd16, ARM::VST2LNd32 }; + unsigned QOpcodes0[] = { ARM::VST2LNq16, ARM::VST2LNq32 }; + unsigned QOpcodes1[] = { ARM::VST2LNq16odd, ARM::VST2LNq32odd }; + return SelectVLDSTLane(N, false, 2, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst3lane: { + unsigned DOpcodes[] = { ARM::VST3LNd8, ARM::VST3LNd16, ARM::VST3LNd32 }; + unsigned QOpcodes0[] = { ARM::VST3LNq16, ARM::VST3LNq32 }; + unsigned QOpcodes1[] = { ARM::VST3LNq16odd, ARM::VST3LNq32odd }; + return SelectVLDSTLane(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst4lane: { + unsigned DOpcodes[] = { ARM::VST4LNd8, ARM::VST4LNd16, ARM::VST4LNd32 }; + unsigned QOpcodes0[] = { ARM::VST4LNq16, ARM::VST4LNq32 }; + unsigned QOpcodes1[] = { ARM::VST4LNq16odd, ARM::VST4LNq32odd }; + return SelectVLDSTLane(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + } + break; + } + + case ISD::CONCAT_VECTORS: + return SelectConcatVector(N); + } + + return SelectCode(N); +} + +bool ARMDAGToDAGISel:: +SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, + std::vector<SDValue> &OutOps) { + assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); + // Require the address to be in a register. That is safe for all ARM + // variants and it is hard to do anything much smarter without knowing + // how the operand is used. + OutOps.push_back(Op); + return false; +} + +/// createARMISelDag - This pass converts a legalized DAG into a +/// ARM-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new ARMDAGToDAGISel(TM, OptLevel); +} + +/// ModelWithRegSequence - Return true if isel should use REG_SEQUENCE to model +/// operations involving sub-registers. 
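+/// The result simply mirrors the UseRegSeq command-line flag; the lowering
+/// code queries it, e.g. to mark CONCAT_VECTORS as Legal rather than Custom
+/// in addTypeForNEON.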
+bool llvm::ModelWithRegSequence() { + return UseRegSeq; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp new file mode 100644 index 0000000..b8126a3 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -0,0 +1,4886 @@ +//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that ARM uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMConstantPoolValue.h" +#include "ARMISelLowering.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMPerfectShuffle.h" +#include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" +#include "ARMTargetMachine.h" +#include "ARMTargetObjectFile.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/GlobalValue.h" +#include "llvm/Instruction.h" +#include "llvm/Intrinsics.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/VectorExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include <sstream> +using namespace llvm; + +static cl::opt<bool> +EnableARMLongCalls("arm-long-calls", cl::Hidden, + cl::desc("Generate calls via indirect call instructions."), + cl::init(false)); + +static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); +static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); +static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); +static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); + +void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, + EVT PromotedBitwiseVT) { + if (VT != PromotedLdStVT) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::LOAD, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::STORE, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); + } + + EVT ElemTy = VT.getVectorElementType(); + if (ElemTy != MVT::i64 && ElemTy != MVT::f64) + setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom); + if (ElemTy == MVT::i8 || ElemTy == MVT::i16) + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); + if (ElemTy != MVT::i32) { + 
setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Expand); + } + setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); + if (llvm::ModelWithRegSequence()) + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); + else + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); + if (VT.isInteger()) { + setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); + } + + // Promote all bit-wise operations. + if (VT.isInteger() && VT != PromotedBitwiseVT) { + setOperationAction(ISD::AND, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::AND, VT.getSimpleVT(), + PromotedBitwiseVT.getSimpleVT()); + setOperationAction(ISD::OR, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::OR, VT.getSimpleVT(), + PromotedBitwiseVT.getSimpleVT()); + setOperationAction(ISD::XOR, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::XOR, VT.getSimpleVT(), + PromotedBitwiseVT.getSimpleVT()); + } + + // Neon does not support vector divide/remainder operations. + setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); +} + +void ARMTargetLowering::addDRTypeForNEON(EVT VT) { + addRegisterClass(VT, ARM::DPRRegisterClass); + addTypeForNEON(VT, MVT::f64, MVT::v2i32); +} + +void ARMTargetLowering::addQRTypeForNEON(EVT VT) { + addRegisterClass(VT, ARM::QPRRegisterClass); + addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); +} + +static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { + if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin()) + return new TargetLoweringObjectFileMachO(); + + return new ARMElfTargetObjectFile(); +} + +ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) + : TargetLowering(TM, createTLOF(TM)) { + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + + if (Subtarget->isTargetDarwin()) { + // Uses VFP for Thumb libfuncs if available. + if (Subtarget->isThumb() && Subtarget->hasVFP2()) { + // Single-precision floating-point arithmetic. + setLibcallName(RTLIB::ADD_F32, "__addsf3vfp"); + setLibcallName(RTLIB::SUB_F32, "__subsf3vfp"); + setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp"); + setLibcallName(RTLIB::DIV_F32, "__divsf3vfp"); + + // Double-precision floating-point arithmetic. + setLibcallName(RTLIB::ADD_F64, "__adddf3vfp"); + setLibcallName(RTLIB::SUB_F64, "__subdf3vfp"); + setLibcallName(RTLIB::MUL_F64, "__muldf3vfp"); + setLibcallName(RTLIB::DIV_F64, "__divdf3vfp"); + + // Single-precision comparisons. 
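+ // The __*vfp comparison routines return a nonzero value when the tested
+ // relation holds, so the setCmpLibcallCC entries that follow compare the
+ // libcall result against zero (SETNE, except for the "ordered" O_F32/O_F64
+ // checks, which reuse the unordered routine and therefore use SETEQ).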
+ setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp"); + setLibcallName(RTLIB::UNE_F32, "__nesf2vfp"); + setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp"); + setLibcallName(RTLIB::OLE_F32, "__lesf2vfp"); + setLibcallName(RTLIB::OGE_F32, "__gesf2vfp"); + setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp"); + setLibcallName(RTLIB::UO_F32, "__unordsf2vfp"); + setLibcallName(RTLIB::O_F32, "__unordsf2vfp"); + + setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ); + + // Double-precision comparisons. + setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp"); + setLibcallName(RTLIB::UNE_F64, "__nedf2vfp"); + setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp"); + setLibcallName(RTLIB::OLE_F64, "__ledf2vfp"); + setLibcallName(RTLIB::OGE_F64, "__gedf2vfp"); + setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp"); + setLibcallName(RTLIB::UO_F64, "__unorddf2vfp"); + setLibcallName(RTLIB::O_F64, "__unorddf2vfp"); + + setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ); + + // Floating-point to integer conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp"); + setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp"); + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp"); + setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp"); + + // Conversions between floating types. + setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp"); + setLibcallName(RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp"); + + // Integer to floating-point conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + // FIXME: There appears to be some naming inconsistency in ARM libgcc: + // e.g., __floatunsidf vs. __floatunssidfvfp. + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp"); + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp"); + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp"); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp"); + } + } + + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, 0); + setLibcallName(RTLIB::SRL_I128, 0); + setLibcallName(RTLIB::SRA_I128, 0); + + // Libcalls should use the AAPCS base standard ABI, even if hard float + // is in effect, as per the ARM RTABI specification, section 4.1.2. 
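+ // The loop below applies this to every runtime libcall; a single-entry
+ // equivalent would look like (illustrative only):
+ //   setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS);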
+ if (Subtarget->isAAPCS_ABI()) { + for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) { + setLibcallCallingConv(static_cast<RTLIB::Libcall>(i), + CallingConv::ARM_AAPCS); + } + } + + if (Subtarget->isThumb1Only()) + addRegisterClass(MVT::i32, ARM::tGPRRegisterClass); + else + addRegisterClass(MVT::i32, ARM::GPRRegisterClass); + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { + addRegisterClass(MVT::f32, ARM::SPRRegisterClass); + addRegisterClass(MVT::f64, ARM::DPRRegisterClass); + + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + } + + if (Subtarget->hasNEON()) { + addDRTypeForNEON(MVT::v2f32); + addDRTypeForNEON(MVT::v8i8); + addDRTypeForNEON(MVT::v4i16); + addDRTypeForNEON(MVT::v2i32); + addDRTypeForNEON(MVT::v1i64); + + addQRTypeForNEON(MVT::v4f32); + addQRTypeForNEON(MVT::v2f64); + addQRTypeForNEON(MVT::v16i8); + addQRTypeForNEON(MVT::v8i16); + addQRTypeForNEON(MVT::v4i32); + addQRTypeForNEON(MVT::v2i64); + + // v2f64 is legal so that QR subregs can be extracted as f64 elements, but + // neither Neon nor VFP support any arithmetic operations on it. + setOperationAction(ISD::FADD, MVT::v2f64, Expand); + setOperationAction(ISD::FSUB, MVT::v2f64, Expand); + setOperationAction(ISD::FMUL, MVT::v2f64, Expand); + setOperationAction(ISD::FDIV, MVT::v2f64, Expand); + setOperationAction(ISD::FREM, MVT::v2f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); + setOperationAction(ISD::VSETCC, MVT::v2f64, Expand); + setOperationAction(ISD::FNEG, MVT::v2f64, Expand); + setOperationAction(ISD::FABS, MVT::v2f64, Expand); + setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); + setOperationAction(ISD::FSIN, MVT::v2f64, Expand); + setOperationAction(ISD::FCOS, MVT::v2f64, Expand); + setOperationAction(ISD::FPOWI, MVT::v2f64, Expand); + setOperationAction(ISD::FPOW, MVT::v2f64, Expand); + setOperationAction(ISD::FLOG, MVT::v2f64, Expand); + setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); + setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); + setOperationAction(ISD::FEXP, MVT::v2f64, Expand); + setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); + setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); + setOperationAction(ISD::FRINT, MVT::v2f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); + + // Neon does not support some operations on v1i64 and v2i64 types. + setOperationAction(ISD::MUL, MVT::v1i64, Expand); + setOperationAction(ISD::MUL, MVT::v2i64, Expand); + setOperationAction(ISD::VSETCC, MVT::v1i64, Expand); + setOperationAction(ISD::VSETCC, MVT::v2i64, Expand); + + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::SELECT_CC); + } + + computeRegisterProperties(); + + // ARM does not have f32 extending load. + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + + // ARM does not have i1 sign extending load. + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + + // ARM supports all 4 flavors of integer indexed load / store. 
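+ // That is, pre- and post-increment as well as pre- and post-decrement
+ // addressing, which the ISD::PRE_INC..ISD::LAST_INDEXED_MODE loop below
+ // registers for i1/i8/i16/i32.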
+ if (!Subtarget->isThumb1Only()) { + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::i1, Legal); + setIndexedLoadAction(im, MVT::i8, Legal); + setIndexedLoadAction(im, MVT::i16, Legal); + setIndexedLoadAction(im, MVT::i32, Legal); + setIndexedStoreAction(im, MVT::i1, Legal); + setIndexedStoreAction(im, MVT::i8, Legal); + setIndexedStoreAction(im, MVT::i16, Legal); + setIndexedStoreAction(im, MVT::i32, Legal); + } + } + + // i64 operation support. + if (Subtarget->isThumb1Only()) { + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); + setOperationAction(ISD::MULHS, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + } else { + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); + if (!Subtarget->hasV6Ops()) + setOperationAction(ISD::MULHS, MVT::i32, Expand); + } + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL, MVT::i64, Custom); + setOperationAction(ISD::SRA, MVT::i64, Custom); + + // ARM does not have ROTL. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::CTTZ, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) + setOperationAction(ISD::CTLZ, MVT::i32, Expand); + + // Only ARMv6 has BSWAP. + if (!Subtarget->hasV6Ops()) + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + + // These are expanded into libcalls. + if (!Subtarget->hasDivide()) { + // v7M has a hardware divider + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + } + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); + setOperationAction(ISD::BlockAddress, MVT::i32, Custom); + + setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // Use the default implementation. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); + // FIXME: Shouldn't need this, since no register is used, but the legalizer + // doesn't yet know how to not do that for SjLj. + setExceptionSelectorRegister(ARM::R0); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); + + // If the subtarget does not have extract instructions, sign_extend_inreg + // needs to be expanded. Extract is available in ARM mode on v6 and up, + // and on most Thumb2 implementations. 
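+ // "Extract" refers to the SXTB/SXTH-style sign-extension instructions; the
+ // hasV6Ops()/hasT2ExtractPack() checks below test for their availability.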
+ if ((!Subtarget->isThumb() && !Subtarget->hasV6Ops()) + || (Subtarget->isThumb2() && !Subtarget->hasT2ExtractPack())) { + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); + } + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) + // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR + // iff target supports vfp2. + setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom); + + // We want to custom lower some of our intrinsics. + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + + setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::f32, Expand); + setOperationAction(ISD::SETCC, MVT::f64, Expand); + setOperationAction(ISD::SELECT, MVT::i32, Expand); + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + + setOperationAction(ISD::BRCOND, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Custom); + setOperationAction(ISD::BR_CC, MVT::f32, Custom); + setOperationAction(ISD::BR_CC, MVT::f64, Custom); + setOperationAction(ISD::BR_JT, MVT::Other, Custom); + + // We don't support sin/cos/fmod/copysign/pow + setOperationAction(ISD::FSIN, MVT::f64, Expand); + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f32, Expand); + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + } + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + + // Various VFP goodness + if (!UseSoftFloat && !Subtarget->isThumb1Only()) { + // int <-> fp are custom expanded into bit_convert + ARMISD ops. + if (Subtarget->hasVFP2()) { + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + } + // Special handling for half-precision FP. + if (!Subtarget->hasFP16()) { + setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand); + setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand); + } + } + + // We have target-specific dag combine patterns for the following nodes: + // ARMISD::VMOVRRD - No need to call setTargetDAGCombine + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::MUL); + + setStackPointerRegisterToSaveRestore(ARM::SP); + + if (UseSoftFloat || Subtarget->isThumb1Only() || !Subtarget->hasVFP2()) + setSchedulingPreference(Sched::RegPressure); + else + setSchedulingPreference(Sched::Hybrid); + + // FIXME: If-converter should use instruction latency to determine + // profitability rather than relying on fixed limits. + if (Subtarget->getCPUString() == "generic") { + // Generic (and overly aggressive) if-conversion limits. 
+ setIfCvtBlockSizeLimit(10); + setIfCvtDupBlockSizeLimit(2); + } else if (Subtarget->hasV7Ops()) { + setIfCvtBlockSizeLimit(3); + setIfCvtDupBlockSizeLimit(1); + } else if (Subtarget->hasV6Ops()) { + setIfCvtBlockSizeLimit(2); + setIfCvtDupBlockSizeLimit(1); + } else { + setIfCvtBlockSizeLimit(3); + setIfCvtDupBlockSizeLimit(2); + } + + maxStoresPerMemcpy = 1; //// temporary - rewrite interface to use type + // Do not enable CodePlacementOpt for now: it currently runs after the + // ARMConstantIslandPass and messes up branch relaxation and placement + // of constant islands. + // benefitFromCodePlacementOpt = true; +} + +const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return 0; + case ARMISD::Wrapper: return "ARMISD::Wrapper"; + case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; + case ARMISD::CALL: return "ARMISD::CALL"; + case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; + case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; + case ARMISD::tCALL: return "ARMISD::tCALL"; + case ARMISD::BRCOND: return "ARMISD::BRCOND"; + case ARMISD::BR_JT: return "ARMISD::BR_JT"; + case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; + case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; + case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; + case ARMISD::CMP: return "ARMISD::CMP"; + case ARMISD::CMPZ: return "ARMISD::CMPZ"; + case ARMISD::CMPFP: return "ARMISD::CMPFP"; + case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; + case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; + case ARMISD::CMOV: return "ARMISD::CMOV"; + case ARMISD::CNEG: return "ARMISD::CNEG"; + + case ARMISD::RBIT: return "ARMISD::RBIT"; + + case ARMISD::FTOSI: return "ARMISD::FTOSI"; + case ARMISD::FTOUI: return "ARMISD::FTOUI"; + case ARMISD::SITOF: return "ARMISD::SITOF"; + case ARMISD::UITOF: return "ARMISD::UITOF"; + + case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; + case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; + case ARMISD::RRX: return "ARMISD::RRX"; + + case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; + case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; + + case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; + case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + + case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; + + case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; + + case ARMISD::MEMBARRIER: return "ARMISD::MEMBARRIER"; + case ARMISD::SYNCBARRIER: return "ARMISD::SYNCBARRIER"; + + case ARMISD::VCEQ: return "ARMISD::VCEQ"; + case ARMISD::VCGE: return "ARMISD::VCGE"; + case ARMISD::VCGEU: return "ARMISD::VCGEU"; + case ARMISD::VCGT: return "ARMISD::VCGT"; + case ARMISD::VCGTU: return "ARMISD::VCGTU"; + case ARMISD::VTST: return "ARMISD::VTST"; + + case ARMISD::VSHL: return "ARMISD::VSHL"; + case ARMISD::VSHRs: return "ARMISD::VSHRs"; + case ARMISD::VSHRu: return "ARMISD::VSHRu"; + case ARMISD::VSHLLs: return "ARMISD::VSHLLs"; + case ARMISD::VSHLLu: return "ARMISD::VSHLLu"; + case ARMISD::VSHLLi: return "ARMISD::VSHLLi"; + case ARMISD::VSHRN: return "ARMISD::VSHRN"; + case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; + case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; + case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; + case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; + case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; + case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; + case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; + case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; + case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; + case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; 
+ case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; + case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; + case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; + case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; + case ARMISD::VDUP: return "ARMISD::VDUP"; + case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; + case ARMISD::VEXT: return "ARMISD::VEXT"; + case ARMISD::VREV64: return "ARMISD::VREV64"; + case ARMISD::VREV32: return "ARMISD::VREV32"; + case ARMISD::VREV16: return "ARMISD::VREV16"; + case ARMISD::VZIP: return "ARMISD::VZIP"; + case ARMISD::VUZP: return "ARMISD::VUZP"; + case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::FMAX: return "ARMISD::FMAX"; + case ARMISD::FMIN: return "ARMISD::FMIN"; + } +} + +/// getRegClassFor - Return the register class that should be used for the +/// specified value type. +TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { + // Map v4i64 to QQ registers but do not make the type legal. Similarly map + // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to + // load / store 4 to 8 consecutive D registers. + if (Subtarget->hasNEON()) { + if (VT == MVT::v4i64) + return ARM::QQPRRegisterClass; + else if (VT == MVT::v8i64) + return ARM::QQQQPRRegisterClass; + } + return TargetLowering::getRegClassFor(VT); +} + +/// getFunctionAlignment - Return the Log2 alignment of this function. +unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const { + return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 0 : 1; +} + +Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { + for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { + EVT VT = N->getValueType(i); + if (VT.isFloatingPoint() || VT.isVector()) + return Sched::Latency; + } + return Sched::RegPressure; +} + +//===----------------------------------------------------------------------===// +// Lowering Code +//===----------------------------------------------------------------------===// + +/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC +static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Unknown condition code!"); + case ISD::SETNE: return ARMCC::NE; + case ISD::SETEQ: return ARMCC::EQ; + case ISD::SETGT: return ARMCC::GT; + case ISD::SETGE: return ARMCC::GE; + case ISD::SETLT: return ARMCC::LT; + case ISD::SETLE: return ARMCC::LE; + case ISD::SETUGT: return ARMCC::HI; + case ISD::SETUGE: return ARMCC::HS; + case ISD::SETULT: return ARMCC::LO; + case ISD::SETULE: return ARMCC::LS; + } +} + +/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 
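+/// Some conditions need a second predicate: CondCode2 is left as AL when it
+/// is unused, while e.g. SETONE becomes MI followed by GT and SETUEQ becomes
+/// EQ followed by VS.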
+static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, + ARMCC::CondCodes &CondCode2) { + CondCode2 = ARMCC::AL; + switch (CC) { + default: llvm_unreachable("Unknown FP condition!"); + case ISD::SETEQ: + case ISD::SETOEQ: CondCode = ARMCC::EQ; break; + case ISD::SETGT: + case ISD::SETOGT: CondCode = ARMCC::GT; break; + case ISD::SETGE: + case ISD::SETOGE: CondCode = ARMCC::GE; break; + case ISD::SETOLT: CondCode = ARMCC::MI; break; + case ISD::SETOLE: CondCode = ARMCC::LS; break; + case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; + case ISD::SETO: CondCode = ARMCC::VC; break; + case ISD::SETUO: CondCode = ARMCC::VS; break; + case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; + case ISD::SETUGT: CondCode = ARMCC::HI; break; + case ISD::SETUGE: CondCode = ARMCC::PL; break; + case ISD::SETLT: + case ISD::SETULT: CondCode = ARMCC::LT; break; + case ISD::SETLE: + case ISD::SETULE: CondCode = ARMCC::LE; break; + case ISD::SETNE: + case ISD::SETUNE: CondCode = ARMCC::NE; break; + } +} + +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// + +#include "ARMGenCallingConv.inc" + +// APCS f64 is in register pairs, possibly split to stack +static bool f64AssignAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { + static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + + // Try to get the first register. + if (unsigned Reg = State.AllocateReg(RegList, 4)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else { + // For the 2nd half of a v2f64, do not fail. + if (CanFail) + return false; + + // Put the whole thing on the stack. + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 4), + LocVT, LocInfo)); + return true; + } + + // Try to get the second register. + if (unsigned Reg = State.AllocateReg(RegList, 4)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(4, 4), + LocVT, LocInfo)); + return true; +} + +static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) + return false; + if (LocVT == MVT::v2f64 && + !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) + return false; + return true; // we handled it +} + +// AAPCS f64 is in aligned register pairs +static bool f64AssignAAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { + static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; + static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; + + unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); + if (Reg == 0) { + // For the 2nd half of a v2f64, do not just fail. + if (CanFail) + return false; + + // Put the whole thing on the stack. 
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 8), + LocVT, LocInfo)); + return true; + } + + unsigned i; + for (i = 0; i < 2; ++i) + if (HiRegList[i] == Reg) + break; + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) + return false; + if (LocVT == MVT::v2f64 && + !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) + return false; + return true; // we handled it +} + +static bool f64RetAssign(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, CCState &State) { + static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; + static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; + + unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); + if (Reg == 0) + return false; // we didn't handle it + + unsigned i; + for (i = 0; i < 2; ++i) + if (HiRegList[i] == Reg) + break; + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) + return false; + if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) + return false; + return true; // we handled it +} + +static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, + State); +} + +/// CCAssignFnForNode - Selects the correct CCAssignFn for a the +/// given CallingConvention value. +CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, + bool Return, + bool isVarArg) const { + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::C: + case CallingConv::Fast: + // Use target triple & subtarget features to do actual dispatch. + if (Subtarget->isAAPCS_ABI()) { + if (Subtarget->hasVFP2() && + FloatABIType == FloatABI::Hard && !isVarArg) + return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + else + return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); + } else + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + case CallingConv::ARM_AAPCS_VFP: + return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + case CallingConv::ARM_AAPCS: + return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); + case CallingConv::ARM_APCS: + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + } +} + +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +SDValue +ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + // Assign locations to each value returned by this call. 
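+ // The same CCAssignFn used for return lowering (Return = true) is applied
+ // here, so the register/stack layout matches what the callee's LowerReturn
+ // produced.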
+ SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, + CCAssignFnForNode(CallConv, /* Return*/ true, + isVarArg)); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + SDValue Val; + if (VA.needsCustom()) { + // Handle f64 or half of a v2f64. + SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Lo.getValue(1); + InFlag = Lo.getValue(2); + VA = RVLocs[++i]; // skip ahead to next loc + SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Hi.getValue(1); + InFlag = Hi.getValue(2); + Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + + if (VA.getLocVT() == MVT::v2f64) { + SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, + DAG.getConstant(0, MVT::i32)); + + VA = RVLocs[++i]; // skip ahead to next loc + Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); + Chain = Lo.getValue(1); + InFlag = Lo.getValue(2); + VA = RVLocs[++i]; // skip ahead to next loc + Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); + Chain = Hi.getValue(1); + InFlag = Hi.getValue(2); + Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, + DAG.getConstant(1, MVT::i32)); + } + } else { + Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), + InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + } + + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), Val); + break; + } + + InVals.push_back(Val); + } + + return Chain; +} + +/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified +/// by "Src" to address "Dst" of size "Size". Alignment information is +/// specified by the specific parameter attribute. The copy will be passed as +/// a byval function parameter. +/// Sometimes what we are copying is the end of a larger object, the part that +/// does not fit in registers. +static SDValue +CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, + ISD::ArgFlagsTy Flags, SelectionDAG &DAG, + DebugLoc dl) { + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); + return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), + /*isVolatile=*/false, /*AlwaysInline=*/false, + NULL, 0, NULL, 0); +} + +/// LowerMemOpCallTo - Store the argument to the stack. 
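+/// byval arguments are copied with a memcpy node via CreateCopyOfByValArgument;
+/// everything else becomes a plain store at SP plus the assigned stack offset.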
+SDValue +ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, + SDValue StackPtr, SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + if (Flags.isByVal()) { + return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); + } + return DAG.getStore(Chain, dl, Arg, PtrOff, + PseudoSourceValue::getStack(), LocMemOffset, + false, false, 0); +} + +void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, + SDValue Chain, SDValue &Arg, + RegsToPassVector &RegsToPass, + CCValAssign &VA, CCValAssign &NextVA, + SDValue &StackPtr, + SmallVector<SDValue, 8> &MemOpChains, + ISD::ArgFlagsTy Flags) const { + + SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Arg); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd)); + + if (NextVA.isRegLoc()) + RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1))); + else { + assert(NextVA.isMemLoc()); + if (StackPtr.getNode() == 0) + StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1), + dl, DAG, NextVA, + Flags)); + } +} + +/// LowerCall - Lowering a call into a callseq_start <- +/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter +/// nodes. +SDValue +ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + // ARM target does not yet support tail call optimization. + isTailCall = false; + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, + *DAG.getContext()); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CallConv, /* Return*/ false, + isVarArg)); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + + SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + + RegsToPassVector RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + + // Walk the register/memloc assignments, inserting copies/loads. In the case + // of tail call optimization, arguments are handled later. + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); + i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = Outs[realArgIdx].Val; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + + // Promote the value if needed. 
+ switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg); + break; + } + + // f64 and v2f64 might be passed in i32 pairs and must be split into pieces + if (VA.needsCustom()) { + if (VA.getLocVT() == MVT::v2f64) { + SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, MVT::i32)); + SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, MVT::i32)); + + PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + + VA = ArgLocs[++i]; // skip ahead to next loc + if (VA.isRegLoc()) { + PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + } else { + assert(VA.isMemLoc()); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, + dl, DAG, VA, Flags)); + } + } else { + PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + StackPtr, MemOpChains, Flags); + } + } else if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + assert(VA.isMemLoc()); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, + dl, DAG, VA, Flags)); + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOpChains[0], MemOpChains.size()); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + bool isDirect = false; + bool isARMFunc = false; + bool isLocalARMFunc = false; + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + if (EnableARMLongCalls) { + assert (getTargetMachine().getRelocationModel() == Reloc::Static + && "long-calls with non-static relocation model!"); + // Handle a global address or an external symbol. If it's not one of + // those, the target's already in a register, so we don't need to do + // anything extra. 
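+ // Both cases below materialize the callee address the same way: create an
+ // ARMConstantPoolValue for it, wrap the constant-pool address in
+ // ARMISD::Wrapper, and load the address from the constant pool.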
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, + ARMPCLabelIndex, + ARMCP::CPValue, 0); + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad(getPointerTy(), dl, + DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { + const char *Sym = S->getSymbol(); + + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(), + Sym, ARMPCLabelIndex, 0); + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad(getPointerTy(), dl, + DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + } + } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + isDirect = true; + bool isExt = GV->isDeclaration() || GV->isWeakForLinker(); + bool isStub = (isExt && Subtarget->isTargetDarwin()) && + getTargetMachine().getRelocationModel() != Reloc::Static; + isARMFunc = !Subtarget->isThumb() || isStub; + // ARM call to a local ARM function is predicable. + isLocalARMFunc = !Subtarget->isThumb() && !isExt; + // tBX takes a register source operand. + if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { + unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, + ARMPCLabelIndex, + ARMCP::CPValue, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad(getPointerTy(), dl, + DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Callee = DAG.getNode(ARMISD::PIC_ADD, dl, + getPointerTy(), Callee, PICLabel); + } else + Callee = DAG.getTargetGlobalAddress(GV, getPointerTy()); + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + isDirect = true; + bool isStub = Subtarget->isTargetDarwin() && + getTargetMachine().getRelocationModel() != Reloc::Static; + isARMFunc = !Subtarget->isThumb() || isStub; + // tBX takes a register source operand. 
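+ // So on Thumb1-only targets without v5T the callee address is loaded from
+ // the constant pool and combined with ARMISD::PIC_ADD before the call, as
+ // in the code below.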
+ const char *Sym = S->getSymbol(); + if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { + unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(), + Sym, ARMPCLabelIndex, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad(getPointerTy(), dl, + DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Callee = DAG.getNode(ARMISD::PIC_ADD, dl, + getPointerTy(), Callee, PICLabel); + } else + Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy()); + } + + // FIXME: handle tail calls differently. + unsigned CallOpc; + if (Subtarget->isThumb()) { + if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) + CallOpc = ARMISD::CALL_NOLINK; + else + CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL; + } else { + CallOpc = (isDirect || Subtarget->hasV5TOps()) + ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL) + : ARMISD::CALL_NOLINK; + } + if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb1Only()) { + // implicit def LR - LR mustn't be allocated as GRP:$dst of CALL_NOLINK + Chain = DAG.getCopyToReg(Chain, dl, ARM::LR, DAG.getUNDEF(MVT::i32),InFlag); + InFlag = Chain.getValue(1); + } + + std::vector<SDValue> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + if (InFlag.getNode()) + Ops.push_back(InFlag); + // Returns a chain and a flag for retval copy to use. + Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag), + &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, + dl, DAG, InVals); +} + +SDValue +ARMTargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + DebugLoc dl, SelectionDAG &DAG) const { + + // CCValAssign - represent the assignment of the return value to a location. + SmallVector<CCValAssign, 16> RVLocs; + + // CCState - Info about the registers and stack slots. + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, + *DAG.getContext()); + + // Analyze outgoing return values. + CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true, + isVarArg)); + + // If this is the first return lowered for this function, add + // the regs to the liveout set for the function. + if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { + for (unsigned i = 0; i != RVLocs.size(); ++i) + if (RVLocs[i].isRegLoc()) + DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); + } + + SDValue Flag; + + // Copy the result values into the output registers. 
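+  // Most values need only a CopyToReg.  An f64 result is first split into
+  // two i32 halves with ARMISD::VMOVRRD and copied into a pair of GPRs, and
+  // a v2f64 result takes two such pairs (the needsCustom() case below).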
+ for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Arg = Outs[realRVLocIdx].Val; + + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg); + break; + } + + if (VA.needsCustom()) { + if (VA.getLocVT() == MVT::v2f64) { + // Extract the first half and return it in two registers. + SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, MVT::i32)); + SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Half); + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag); + Flag = Chain.getValue(1); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(1), Flag); + Flag = Chain.getValue(1); + VA = RVLocs[++i]; // skip ahead to next loc + + // Extract the 2nd half and fall through to handle it as an f64 value. + Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, MVT::i32)); + } + // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is + // available. + SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag); + Flag = Chain.getValue(1); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1), + Flag); + } else + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); + + // Guarantee that all emitted copies are + // stuck together, avoiding something bad. + Flag = Chain.getValue(1); + } + + SDValue result; + if (Flag.getNode()) + result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain, Flag); + else // Return Void + result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain); + + return result; +} + +// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as +// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is +// one of the above mentioned nodes. It has to be wrapped because otherwise +// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only +// be used to form addressing mode. These wrapped nodes will be selected +// into MOVi. 
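+// For example, LowerConstantPool below emits
+//   (ARMISD::Wrapper (TargetConstantPool <cp index>))
+// rather than a bare TargetConstantPool node.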
+static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { + EVT PtrVT = Op.getValueType(); + // FIXME there is no actual debug info here + DebugLoc dl = Op.getDebugLoc(); + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + SDValue Res; + if (CP->isMachineConstantPoolEntry()) + Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, + CP->getAlignment()); + else + Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, + CP->getAlignment()); + return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); +} + +SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = 0; + DebugLoc DL = Op.getDebugLoc(); + EVT PtrVT = getPointerTy(); + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + SDValue CPAddr; + if (RelocM == Reloc::Static) { + CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); + } else { + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(BA, ARMPCLabelIndex, + ARMCP::CPBlockAddress, + PCAdj); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + } + CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); + SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + if (RelocM == Reloc::Static) + return Result; + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); +} + +// Lower ISD::GlobalTLSAddress using the "general dynamic" model +SDValue +ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG) const { + DebugLoc dl = GA->getDebugLoc(); + EVT PtrVT = getPointerTy(); + unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + ARMConstantPoolValue *CPV = + new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, "tlsgd", true); + SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); + Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + SDValue Chain = Argument.getValue(1); + + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); + + // call __tls_get_addr. + ArgListTy Args; + ArgListEntry Entry; + Entry.Node = Argument; + Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext()); + Args.push_back(Entry); + // FIXME: is there useful debug info available here? + std::pair<SDValue, SDValue> CallResult = + LowerCallTo(Chain, (const Type *) Type::getInt32Ty(*DAG.getContext()), + false, false, false, false, + 0, CallingConv::C, false, /*isReturnValueUsed=*/true, + DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl); + return CallResult.first; +} + +// Lower ISD::GlobalTLSAddress using the "initial exec" or +// "local exec" model. 
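+// "Initial exec" is chosen when the variable is only a declaration here (it
+// is defined in another module); its offset is loaded indirectly through a
+// "gottpoff" constant-pool entry.  "Local exec" is used for variables defined
+// in this module and adds a direct "tpoff" offset to the thread pointer.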
+SDValue +ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, + SelectionDAG &DAG) const { + const GlobalValue *GV = GA->getGlobal(); + DebugLoc dl = GA->getDebugLoc(); + SDValue Offset; + SDValue Chain = DAG.getEntryNode(); + EVT PtrVT = getPointerTy(); + // Get the Thread Pointer + SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); + + if (GV->isDeclaration()) { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + // Initial exec model. + unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = + new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, "gottpoff", true); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); + Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + Chain = Offset.getValue(1); + + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); + + Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + } else { + // local exec model + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, "tpoff"); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); + Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + } + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); +} + +SDValue +ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + // TODO: implement the "local dynamic" model + assert(Subtarget->isTargetELF() && + "TLS not implemented for non-ELF targets"); + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + // If the relocation model is PIC, use the "General Dynamic" TLS Model, + // otherwise use the "Local Exec" TLS Model + if (getTargetMachine().getRelocationModel() == Reloc::PIC_) + return LowerToTLSGeneralDynamicModel(GA, DAG); + else + return LowerToTLSExecModels(GA, DAG); +} + +SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + if (RelocM == Reloc::PIC_) { + bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); + ARMConstantPoolValue *CPV = + new ARMConstantPoolValue(GV, UseGOTOFF ? "GOTOFF" : "GOT"); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + SDValue Chain = Result.getValue(1); + SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); + Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT); + if (!UseGOTOFF) + Result = DAG.getLoad(PtrVT, dl, Chain, Result, + PseudoSourceValue::getGOT(), 0, + false, false, 0); + return Result; + } else { + // If we have T2 ops, we can materialize the address directly via movt/movw + // pair. This is always cheaper. 
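+    // That is, emit something along the lines of
+    //   movw rN, :lower16:sym
+    //   movt rN, :upper16:sym
+    // instead of loading the address from the constant pool.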
+ if (Subtarget->useMovt()) { + return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, + DAG.getTargetGlobalAddress(GV, PtrVT)); + } else { + SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + } + } +} + +SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = 0; + EVT PtrVT = getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + SDValue CPAddr; + if (RelocM == Reloc::Static) + CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); + else { + ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8); + ARMConstantPoolValue *CPV = + new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + } + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + + SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + SDValue Chain = Result.getValue(1); + + if (RelocM == Reloc::PIC_) { + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + } + + if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) + Result = DAG.getLoad(PtrVT, dl, Chain, Result, + PseudoSourceValue::getGOT(), 0, + false, false, 0); + + return Result; +} + +SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetELF() && + "GLOBAL OFFSET TABLE not implemented for non-ELF targets"); + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + EVT PtrVT = getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(), + "_GLOBAL_OFFSET_TABLE_", + ARMPCLabelIndex, PCAdj); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); +} + +SDValue +ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + SDValue Val = Subtarget->isThumb() ? 
+ DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::SP, MVT::i32) : + DAG.getConstant(0, MVT::i32); + return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(0), + Op.getOperand(1), Val); +} + +SDValue +ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), + Op.getOperand(1), DAG.getConstant(0, MVT::i32)); +} + +SDValue +ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) + const { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + DebugLoc dl = Op.getDebugLoc(); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::arm_thread_pointer: { + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); + } + case Intrinsic::eh_sjlj_lsda: { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + EVT PtrVT = getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + SDValue CPAddr; + unsigned PCAdj = (RelocM != Reloc::PIC_) + ? 0 : (Subtarget->isThumb() ? 4 : 8); + ARMConstantPoolValue *CPV = + new ARMConstantPoolValue(MF.getFunction(), ARMPCLabelIndex, + ARMCP::CPLSDA, PCAdj); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + SDValue Result = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + SDValue Chain = Result.getValue(1); + + if (RelocM == Reloc::PIC_) { + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + } + return Result; + } + } +} + +static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + DebugLoc dl = Op.getDebugLoc(); + SDValue Op5 = Op.getOperand(5); + SDValue Res; + unsigned isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue(); + if (isDeviceBarrier) { + if (Subtarget->hasV7Ops()) + Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0)); + else + Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(0, MVT::i32)); + } else { + if (Subtarget->hasV7Ops()) + Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); + else + Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(0, MVT::i32)); + } + return Res; +} + +static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); + + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. 
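+  // In other words va_start(ap, ...) becomes a single store of the address of
+  // the varargs save area into ap; va_arg is then expanded generically as
+  // pointer arithmetic on that address.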
+ DebugLoc dl = Op.getDebugLoc(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, + false, false, 0); +} + +SDValue +ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + SDNode *Node = Op.getNode(); + DebugLoc dl = Node->getDebugLoc(); + EVT VT = Node->getValueType(0); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + SDValue Align = Op.getOperand(2); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); + + unsigned AlignVal = cast<ConstantSDNode>(Align)->getZExtValue(); + unsigned StackAlign = getTargetMachine().getFrameInfo()->getStackAlignment(); + if (AlignVal > StackAlign) + // Do this now since selection pass cannot introduce new target + // independent node. + Align = DAG.getConstant(-(uint64_t)AlignVal, VT); + + // In Thumb1 mode, there isn't a "sub r, sp, r" instruction, we will end up + // using a "add r, sp, r" instead. Negate the size now so we don't have to + // do even more horrible hack later. + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + if (AFI->isThumb1OnlyFunction()) { + bool Negate = true; + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size); + if (C) { + uint32_t Val = C->getZExtValue(); + if (Val <= 508 && ((Val & 3) == 0)) + Negate = false; + } + if (Negate) + Size = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), Size); + } + + SDVTList VTList = DAG.getVTList(VT, MVT::Other); + SDValue Ops1[] = { Chain, Size, Align }; + SDValue Res = DAG.getNode(ARMISD::DYN_ALLOC, dl, VTList, Ops1, 3); + Chain = Res.getValue(1); + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true), + DAG.getIntPtrConstant(0, true), SDValue()); + SDValue Ops2[] = { Res, Chain }; + return DAG.getMergeValues(Ops2, 2, dl); +} + +SDValue +ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, + SDValue &Root, SelectionDAG &DAG, + DebugLoc dl) const { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + TargetRegisterClass *RC; + if (AFI->isThumb1OnlyFunction()) + RC = ARM::tGPRRegisterClass; + else + RC = ARM::GPRRegisterClass; + + // Transform the arguments stored in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); + + SDValue ArgValue2; + if (NextVA.isMemLoc()) { + MachineFrameInfo *MFI = MF.getFrameInfo(); + int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true, false); + + // Create load node to retrieve arguments from the stack. 
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, + PseudoSourceValue::getFixedStack(FI), 0, + false, false, 0); + } else { + Reg = MF.addLiveIn(NextVA.getLocReg(), RC); + ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); + } + + return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); +} + +SDValue +ARMTargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) + const { + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + // Assign locations to all of the incoming arguments. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, + *DAG.getContext()); + CCInfo.AnalyzeFormalArguments(Ins, + CCAssignFnForNode(CallConv, /* Return*/ false, + isVarArg)); + + SmallVector<SDValue, 16> ArgValues; + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + // Arguments stored in registers. + if (VA.isRegLoc()) { + EVT RegVT = VA.getLocVT(); + + SDValue ArgValue; + if (VA.needsCustom()) { + // f64 and vector types are split up into multiple registers or + // combinations of registers and stack slots. + if (VA.getLocVT() == MVT::v2f64) { + SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + VA = ArgLocs[++i]; // skip ahead to next loc + SDValue ArgValue2; + if (VA.isMemLoc()) { + int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), + true, false); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, + PseudoSourceValue::getFixedStack(FI), 0, + false, false, 0); + } else { + ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + } + ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue1, DAG.getIntPtrConstant(0)); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue2, DAG.getIntPtrConstant(1)); + } else + ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + + } else { + TargetRegisterClass *RC; + + if (RegVT == MVT::f32) + RC = ARM::SPRRegisterClass; + else if (RegVT == MVT::f64) + RC = ARM::DPRRegisterClass; + else if (RegVT == MVT::v2f64) + RC = ARM::QPRRegisterClass; + else if (RegVT == MVT::i32) + RC = (AFI->isThumb1OnlyFunction() ? + ARM::tGPRRegisterClass : ARM::GPRRegisterClass); + else + llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); + + // Transform the arguments in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + } + + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. 
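+      // For example, a sign-extended i8 argument arrives in a full i32
+      // register and becomes
+      //   (truncate i8 (AssertSext i32 <copy-from-reg>, i8))
+      // so later passes know the upper bits already hold the sign extension.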
+ switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); + break; + case CCValAssign::SExt: + ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + break; + case CCValAssign::ZExt: + ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + break; + } + + InVals.push_back(ArgValue); + + } else { // VA.isRegLoc() + + // sanity check + assert(VA.isMemLoc()); + assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); + + unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; + int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), + true, false); + + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, + PseudoSourceValue::getFixedStack(FI), 0, + false, false, 0)); + } + } + + // varargs + if (isVarArg) { + static const unsigned GPRArgRegs[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 + }; + + unsigned NumGPRs = CCInfo.getFirstUnallocated + (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); + + unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); + unsigned VARegSize = (4 - NumGPRs) * 4; + unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); + unsigned ArgOffset = CCInfo.getNextStackOffset(); + if (VARegSaveSize) { + // If this function is vararg, store any remaining integer argument regs + // to their spots on the stack so that they may be loaded by deferencing + // the result of va_next. + AFI->setVarArgsRegSaveSize(VARegSaveSize); + AFI->setVarArgsFrameIndex( + MFI->CreateFixedObject(VARegSaveSize, + ArgOffset + VARegSaveSize - VARegSize, + true, false)); + SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), + getPointerTy()); + + SmallVector<SDValue, 4> MemOps; + for (; NumGPRs < 4; ++NumGPRs) { + TargetRegisterClass *RC; + if (AFI->isThumb1OnlyFunction()) + RC = ARM::tGPRRegisterClass; + else + RC = ARM::GPRRegisterClass; + + unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()), 0, + false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, + DAG.getConstant(4, getPointerTy())); + } + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); + } else + // This will point to the next argument passed via stack. + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, + true, false)); + } + + return Chain; +} + +/// isFloatingPointZero - Return true if this is +0.0. +static bool isFloatingPointZero(SDValue Op) { + if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) + return CFP->getValueAPF().isPosZero(); + else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { + // Maybe this has already been legalized into the constant pool? 
+ if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { + SDValue WrapperOp = Op.getOperand(1).getOperand(0); + if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) + return CFP->getValueAPF().isPosZero(); + } + } + return false; +} + +/// Returns appropriate ARM CMP (cmp) and corresponding condition code for +/// the given operands. +SDValue +ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &ARMCC, SelectionDAG &DAG, + DebugLoc dl) const { + if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { + unsigned C = RHSC->getZExtValue(); + if (!isLegalICmpImmediate(C)) { + // Constant does not fit, try adjusting it by one? + switch (CC) { + default: break; + case ISD::SETLT: + case ISD::SETGE: + if (isLegalICmpImmediate(C-1)) { + CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; + RHS = DAG.getConstant(C-1, MVT::i32); + } + break; + case ISD::SETULT: + case ISD::SETUGE: + if (C > 0 && isLegalICmpImmediate(C-1)) { + CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; + RHS = DAG.getConstant(C-1, MVT::i32); + } + break; + case ISD::SETLE: + case ISD::SETGT: + if (isLegalICmpImmediate(C+1)) { + CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; + RHS = DAG.getConstant(C+1, MVT::i32); + } + break; + case ISD::SETULE: + case ISD::SETUGT: + if (C < 0xffffffff && isLegalICmpImmediate(C+1)) { + CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; + RHS = DAG.getConstant(C+1, MVT::i32); + } + break; + } + } + } + + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + ARMISD::NodeType CompareType; + switch (CondCode) { + default: + CompareType = ARMISD::CMP; + break; + case ARMCC::EQ: + case ARMCC::NE: + // Uses only Z Flag + CompareType = ARMISD::CMPZ; + break; + } + ARMCC = DAG.getConstant(CondCode, MVT::i32); + return DAG.getNode(CompareType, dl, MVT::Flag, LHS, RHS); +} + +/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. +static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, + DebugLoc dl) { + SDValue Cmp; + if (!isFloatingPointZero(RHS)) + Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Flag, LHS, RHS); + else + Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Flag, LHS); + return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Flag, Cmp); +} + +SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); + SDValue TrueVal = Op.getOperand(2); + SDValue FalseVal = Op.getOperand(3); + DebugLoc dl = Op.getDebugLoc(); + + if (LHS.getValueType() == MVT::i32) { + SDValue ARMCC; + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl); + return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, CCR,Cmp); + } + + ARMCC::CondCodes CondCode, CondCode2; + FPCCToARMCC(CC, CondCode, CondCode2); + + SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); + SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, + ARMCC, CCR, Cmp); + if (CondCode2 != ARMCC::AL) { + SDValue ARMCC2 = DAG.getConstant(CondCode2, MVT::i32); + // FIXME: Needs another CMP because flag can have but one use. 
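+    // Some FP conditions (e.g. SETONE, SETUEQ) map to two ARM condition
+    // codes, so the select is emitted as two chained CMOVs, one per code.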
+ SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); + Result = DAG.getNode(ARMISD::CMOV, dl, VT, + Result, TrueVal, ARMCC2, CCR, Cmp2); + } + return Result; +} + +SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + DebugLoc dl = Op.getDebugLoc(); + + if (LHS.getValueType() == MVT::i32) { + SDValue ARMCC; + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl); + return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, + Chain, Dest, ARMCC, CCR,Cmp); + } + + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + ARMCC::CondCodes CondCode, CondCode2; + FPCCToARMCC(CC, CondCode, CondCode2); + + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); + SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag); + SDValue Ops[] = { Chain, Dest, ARMCC, CCR, Cmp }; + SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); + if (CondCode2 != ARMCC::AL) { + ARMCC = DAG.getConstant(CondCode2, MVT::i32); + SDValue Ops[] = { Res, Dest, ARMCC, CCR, Res.getValue(1) }; + Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); + } + return Res; +} + +SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Table = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + + EVT PTy = getPointerTy(); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); + ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); + SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy); + SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); + Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId); + Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy)); + SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); + if (Subtarget->isThumb2()) { + // Thumb2 uses a two-level jump. That is, it jumps into the jump table + // which does another jump to the destination. This also makes it easier + // to translate it to TBB / TBH later. + // FIXME: This might not work if the function is extremely large. 
+ return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, + Addr, Op.getOperand(2), JTI, UId); + } + if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { + Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, + PseudoSourceValue::getJumpTable(), 0, + false, false, 0); + Chain = Addr.getValue(1); + Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); + return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); + } else { + Addr = DAG.getLoad(PTy, dl, Chain, Addr, + PseudoSourceValue::getJumpTable(), 0, false, false, 0); + Chain = Addr.getValue(1); + return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); + } +} + +static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + unsigned Opc; + + switch (Op.getOpcode()) { + default: + assert(0 && "Invalid opcode!"); + case ISD::FP_TO_SINT: + Opc = ARMISD::FTOSI; + break; + case ISD::FP_TO_UINT: + Opc = ARMISD::FTOUI; + break; + } + Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0)); + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op); +} + +static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + unsigned Opc; + + switch (Op.getOpcode()) { + default: + assert(0 && "Invalid opcode!"); + case ISD::SINT_TO_FP: + Opc = ARMISD::SITOF; + break; + case ISD::UINT_TO_FP: + Opc = ARMISD::UITOF; + break; + } + + Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Op.getOperand(0)); + return DAG.getNode(Opc, dl, VT, Op); +} + +static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { + // Implement fcopysign with a fabs and a conditional fneg. + SDValue Tmp0 = Op.getOperand(0); + SDValue Tmp1 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + EVT SrcVT = Tmp1.getValueType(); + SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0); + SDValue Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG, dl); + SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp); +} + +SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(4, MVT::i32); + return DAG.getLoad(VT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), + NULL, 0, false, false, 0); + } + + // Return LR, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(ARM::LR, ARM::GPRRegisterClass); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); +} + +SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin()) + ? 
ARM::R7 : ARM::R11; + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, + false, false, 0); + return FrameAddr; +} + +/// ExpandBIT_CONVERT - If the target supports VFP, this function is called to +/// expand a bit convert where either the source or destination type is i64 to +/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 +/// operand type is illegal (e.g., v2f32 for a target that doesn't support +/// vectors), since the legalizer won't know what to do with that. +static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + DebugLoc dl = N->getDebugLoc(); + SDValue Op = N->getOperand(0); + + // This function is only supposed to be called for i64 types, either as the + // source or destination of the bit convert. + EVT SrcVT = Op.getValueType(); + EVT DstVT = N->getValueType(0); + assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && + "ExpandBIT_CONVERT called for non-i64 type"); + + // Turn i64->f64 into VMOVDRR. + if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, + DAG.getConstant(0, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, + DAG.getConstant(1, MVT::i32)); + return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + } + + // Turn f64->i64 into VMOVRRD. + if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { + SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), &Op, 1); + // Merge the pieces into a single i64 value. + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); + } + + return SDValue(); +} + +/// getZeroVector - Returns a vector of specified type with all zero elements. +/// +static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + + // Zero vectors are used to represent vector negation and in those cases + // will be implemented with the NEON VNEG instruction. However, VNEG does + // not support i64 elements, so sometimes the zero vectors will need to be + // explicitly constructed. For those cases, and potentially other uses in + // the future, always build zero vectors as <16 x i8> or <8 x i8> bitcasted + // to their dest type. This ensures they get CSE'd. + SDValue Vec; + SDValue Cst = DAG.getTargetConstant(0, MVT::i8); + SmallVector<SDValue, 8> Ops; + MVT TVT; + + if (VT.getSizeInBits() == 64) { + Ops.assign(8, Cst); TVT = MVT::v8i8; + } else { + Ops.assign(16, Cst); TVT = MVT::v16i8; + } + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size()); + + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); +} + +/// getOnesVector - Returns a vector of specified type with all bits set. +/// +static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + + // Always build ones vectors as <16 x i8> or <8 x i8> bitcasted to their + // dest type. This ensures they get CSE'd. 
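+  // e.g. an all-ones v4i32 is built as a <16 x i8> splat of 0xFF and then
+  // bitcast to v4i32, so it shares a node with every other 128-bit ones
+  // vector.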
+ SDValue Vec; + SDValue Cst = DAG.getTargetConstant(0xFF, MVT::i8); + SmallVector<SDValue, 8> Ops; + MVT TVT; + + if (VT.getSizeInBits() == 64) { + Ops.assign(8, Cst); TVT = MVT::v8i8; + } else { + Ops.assign(16, Cst); TVT = MVT::v16i8; + } + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size()); + + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); +} + +/// LowerShiftRightParts - Lower SRA_PARTS, which returns two +/// i32 values and take a 2 x i32 value to shift plus a shift amount. +SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMCC; + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; + + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE, + ARMCC, DAG, dl); + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, + CCR, Cmp); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, 2, dl); +} + +/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two +/// i32 values and take a 2 x i32 value to shift plus a shift amount. 
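+/// For a shift amount below the word size this computes, roughly,
+///   Lo = ShOpLo << Amt
+///   Hi = (ShOpHi << Amt) | (ShOpLo >> (32 - Amt))
+/// and for larger amounts a CMOV on (Amt - 32) substitutes
+/// ShOpLo << (Amt - 32) for Hi.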
+SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMCC; + + assert(Op.getOpcode() == ISD::SHL_PARTS); + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE, + ARMCC, DAG, dl); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMCC, + CCR, Cmp); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, 2, dl); +} + +static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); + + if (!ST->hasV6T2Ops()) + return SDValue(); + + SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0)); + return DAG.getNode(ISD::CTLZ, dl, VT, rbit); +} + +static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); + + // Lower vector shifts on NEON to use VSHL. + if (VT.isVector()) { + assert(ST->hasNEON() && "unexpected vector shift"); + + // Left shifts translate directly to the vshiftu intrinsic. + if (N->getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32), + N->getOperand(0), N->getOperand(1)); + + assert((N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); + + // NEON uses the same intrinsics for both left and right shifts. For + // right shifts, the shift amounts are negative, so negate the vector of + // shift amounts. + EVT ShiftVT = N->getOperand(1).getValueType(); + SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, + getZeroVector(ShiftVT, DAG, dl), + N->getOperand(1)); + Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? + Intrinsic::arm_neon_vshifts : + Intrinsic::arm_neon_vshiftu); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(vshiftInt, MVT::i32), + N->getOperand(0), NegatedCount); + } + + // We can get here for a node like i32 = ISD::SHL i32, i64 + if (VT != MVT::i64) + return SDValue(); + + assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && + "Unknown shift to lower!"); + + // We only lower SRA, SRL of 1 here, all others use generic lowering. + if (!isa<ConstantSDNode>(N->getOperand(1)) || + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1) + return SDValue(); + + // If we are in thumb mode, we don't have RRX. + if (ST->isThumb1Only()) return SDValue(); + + // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 
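+  // That is: shift the high word with a flag-setting shift so the bit that
+  // falls off lands in the carry flag, then rotate the low word right through
+  // that carry with RRX.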
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), + DAG.getConstant(0, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), + DAG.getConstant(1, MVT::i32)); + + // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and + // captures the result into a carry flag. + unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; + Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Flag), &Hi, 1); + + // The low part is an ARMISD::RRX operand, which shifts the carry in. + Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); + + // Merge the pieces into a single i64 value. + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); +} + +static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { + SDValue TmpOp0, TmpOp1; + bool Invert = false; + bool Swap = false; + unsigned Opc = 0; + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + EVT VT = Op.getValueType(); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + DebugLoc dl = Op.getDebugLoc(); + + if (Op.getOperand(1).getValueType().isFloatingPoint()) { + switch (SetCCOpcode) { + default: llvm_unreachable("Illegal FP comparison"); break; + case ISD::SETUNE: + case ISD::SETNE: Invert = true; // Fallthrough + case ISD::SETOEQ: + case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETOLT: + case ISD::SETLT: Swap = true; // Fallthrough + case ISD::SETOGT: + case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETOLE: + case ISD::SETLE: Swap = true; // Fallthrough + case ISD::SETOGE: + case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETUGE: Swap = true; // Fallthrough + case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; + case ISD::SETUGT: Swap = true; // Fallthrough + case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; + case ISD::SETUEQ: Invert = true; // Fallthrough + case ISD::SETONE: + // Expand this to (OLT | OGT). + TmpOp0 = Op0; + TmpOp1 = Op1; + Opc = ISD::OR; + Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1); + break; + case ISD::SETUO: Invert = true; // Fallthrough + case ISD::SETO: + // Expand this to (OLT | OGE). + TmpOp0 = Op0; + TmpOp1 = Op1; + Opc = ISD::OR; + Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1); + break; + } + } else { + // Integer comparisons. + switch (SetCCOpcode) { + default: llvm_unreachable("Illegal integer comparison"); break; + case ISD::SETNE: Invert = true; + case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETLT: Swap = true; + case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETLE: Swap = true; + case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: Opc = ARMISD::VCGTU; break; + case ISD::SETULE: Swap = true; + case ISD::SETUGE: Opc = ARMISD::VCGEU; break; + } + + // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). + if (Opc == ARMISD::VCEQ) { + + SDValue AndOp; + if (ISD::isBuildVectorAllZeros(Op1.getNode())) + AndOp = Op0; + else if (ISD::isBuildVectorAllZeros(Op0.getNode())) + AndOp = Op1; + + // Ignore bitconvert. 
+ if (AndOp.getNode() && AndOp.getOpcode() == ISD::BIT_CONVERT) + AndOp = AndOp.getOperand(0); + + if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { + Opc = ARMISD::VTST; + Op0 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(0)); + Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(1)); + Invert = !Invert; + } + } + } + + if (Swap) + std::swap(Op0, Op1); + + SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + + return Result; +} + +/// isVMOVSplat - Check if the specified splat value corresponds to an immediate +/// VMOV instruction, and if so, return the constant being splatted. +static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef, + unsigned SplatBitSize, SelectionDAG &DAG) { + switch (SplatBitSize) { + case 8: + // Any 1-byte value is OK. + assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); + return DAG.getTargetConstant(SplatBits, MVT::i8); + + case 16: + // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. + if ((SplatBits & ~0xff) == 0 || + (SplatBits & ~0xff00) == 0) + return DAG.getTargetConstant(SplatBits, MVT::i16); + break; + + case 32: + // NEON's 32-bit VMOV supports splat values where: + // * only one byte is nonzero, or + // * the least significant byte is 0xff and the second byte is nonzero, or + // * the least significant 2 bytes are 0xff and the third is nonzero. + if ((SplatBits & ~0xff) == 0 || + (SplatBits & ~0xff00) == 0 || + (SplatBits & ~0xff0000) == 0 || + (SplatBits & ~0xff000000) == 0) + return DAG.getTargetConstant(SplatBits, MVT::i32); + + if ((SplatBits & ~0xffff) == 0 && + ((SplatBits | SplatUndef) & 0xff) == 0xff) + return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32); + + if ((SplatBits & ~0xffffff) == 0 && + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) + return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32); + + // Note: there are a few 32-bit splat values (specifically: 00ffff00, + // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not + // VMOV.I32. A (very) minor optimization would be to replicate the value + // and fall through here to test for a valid 64-bit splat. But, then the + // caller would also need to check and handle the change in size. + break; + + case 64: { + // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. + uint64_t BitMask = 0xff; + uint64_t Val = 0; + for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { + if (((SplatBits | SplatUndef) & BitMask) == BitMask) + Val |= BitMask; + else if ((SplatBits & BitMask) != 0) + return SDValue(); + BitMask <<= 8; + } + return DAG.getTargetConstant(Val, MVT::i64); + } + + default: + llvm_unreachable("unexpected size for isVMOVSplat"); + break; + } + + return SDValue(); +} + +/// getVMOVImm - If this is a build_vector of constants which can be +/// formed by using a VMOV instruction of the specified element size, +/// return the constant being splatted. The ByteSize field indicates the +/// number of bytes of each element [1248]. +SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (! BVN || ! 
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ByteSize * 8)) + return SDValue(); + + if (SplatBitSize > ByteSize * 8) + return SDValue(); + + return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG); +} + +static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT, + bool &ReverseVEXT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); + ReverseVEXT = false; + Imm = M[0]; + + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. The other shuffle indices must be the successive elements after + // the first one. + unsigned ExpectedElt = Imm; + for (unsigned i = 1; i < NumElts; ++i) { + // Increment the expected index. If it wraps around, it may still be + // a VEXT but the source vectors must be swapped. + ExpectedElt += 1; + if (ExpectedElt == NumElts * 2) { + ExpectedElt = 0; + ReverseVEXT = true; + } + + if (ExpectedElt != static_cast<unsigned>(M[i])) + return false; + } + + // Adjust the index value if the source operands will be swapped. + if (ReverseVEXT) + Imm -= NumElts; + + return true; +} + +/// isVREVMask - Check if a vector shuffle corresponds to a VREV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) +static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned BlockSize) { + assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && + "Only possible block sizes for VREV are: 16, 32, 64"); + + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BlockElts = M[0] + 1; + + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned) M[i] != + (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) + return false; + } + + return true; +} + +static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((unsigned) M[i] != i + WhichResult || + (unsigned) M[i+1] != i + NumElts + WhichResult) + return false; + } + return true; +} + +/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. +static bool isVTRN_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((unsigned) M[i] != i + WhichResult || + (unsigned) M[i+1] != i + WhichResult) + return false; + } + return true; +} + +static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + if ((unsigned) M[i] != 2 * i + WhichResult) + return false; + } + + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 
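+  // (a two-element "unzip" is the same permutation as a transpose, so reject
+  // the mask here and let the VTRN checks claim it instead)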
+ if (VT.is64BitVector() && EltSz == 32) + return false; + + return true; +} + +/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, +static bool isVUZP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned Half = VT.getVectorNumElements() / 2; + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned j = 0; j != 2; ++j) { + unsigned Idx = WhichResult; + for (unsigned i = 0; i != Half; ++i) { + if ((unsigned) M[i + j * Half] != Idx) + return false; + Idx += 2; + } + } + + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && EltSz == 32) + return false; + + return true; +} + +static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((unsigned) M[i] != Idx || + (unsigned) M[i+1] != Idx + NumElts) + return false; + Idx += 1; + } + + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && EltSz == 32) + return false; + + return true; +} + +/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. +static bool isVZIP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((unsigned) M[i] != Idx || + (unsigned) M[i+1] != Idx) + return false; + Idx += 1; + } + + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && EltSz == 32) + return false; + + return true; +} + + +static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) { + // Canonicalize all-zeros and all-ones vectors. + ConstantSDNode *ConstVal = cast<ConstantSDNode>(Val.getNode()); + if (ConstVal->isNullValue()) + return getZeroVector(VT, DAG, dl); + if (ConstVal->isAllOnesValue()) + return getOnesVector(VT, DAG, dl); + + EVT CanonicalVT; + if (VT.is64BitVector()) { + switch (Val.getValueType().getSizeInBits()) { + case 8: CanonicalVT = MVT::v8i8; break; + case 16: CanonicalVT = MVT::v4i16; break; + case 32: CanonicalVT = MVT::v2i32; break; + case 64: CanonicalVT = MVT::v1i64; break; + default: llvm_unreachable("unexpected splat element type"); break; + } + } else { + assert(VT.is128BitVector() && "unknown splat vector size"); + switch (Val.getValueType().getSizeInBits()) { + case 8: CanonicalVT = MVT::v16i8; break; + case 16: CanonicalVT = MVT::v8i16; break; + case 32: CanonicalVT = MVT::v4i32; break; + case 64: CanonicalVT = MVT::v2i64; break; + default: llvm_unreachable("unexpected splat element type"); break; + } + } + + // Build a canonical splat for this value. 
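+  // e.g. a 32-bit value splatted into a 128-bit vector becomes a v4i32
+  // BUILD_VECTOR with four copies of Val, bitcast back to the requested type.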
+ SmallVector<SDValue, 8> Ops; + Ops.assign(CanonicalVT.getVectorNumElements(), Val); + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, &Ops[0], + Ops.size()); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Res); +} + +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. +static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { + BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + SDValue Val = isVMOVSplat(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, DAG); + if (Val.getNode()) + return BuildSplat(Val, VT, DAG, dl); + } + } + + // Scan through the operands to see if only one value is used. + unsigned NumElts = VT.getVectorNumElements(); + bool isOnlyLowElement = true; + bool usesOnlyOneValue = true; + bool isConstant = true; + SDValue Value; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (i > 0) + isOnlyLowElement = false; + if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) + isConstant = false; + + if (!Value.getNode()) + Value = V; + else if (V != Value) + usesOnlyOneValue = false; + } + + if (!Value.getNode()) + return DAG.getUNDEF(VT); + + if (isOnlyLowElement) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); + + // If all elements are constants, fall back to the default expansion, which + // will generate a load from the constant pool. + if (isConstant) + return SDValue(); + + // Use VDUP for non-constant splats. + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (usesOnlyOneValue && EltSize <= 32) + return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + + // Vectors with 32- or 64-bit elements can be built by directly assigning + // the subregisters. + if (EltSize >= 32) { + // Do the expansion with floating-point types, since that is what the VFP + // registers are defined to use, and since i64 is not legal. + EVT EltVT = EVT::getFloatingPointVT(EltSize); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); + SDValue Val = DAG.getUNDEF(VecVT); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Elt = Op.getOperand(i); + if (Elt.getOpcode() == ISD::UNDEF) + continue; + Elt = DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Elt); + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Val, Elt, + DAG.getConstant(i, MVT::i32)); + } + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); + } + + return SDValue(); +} + +/// isShuffleMaskLegal - Targets can use this to indicate that they only +/// support *some* VECTOR_SHUFFLE operations, those with specific masks. +/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values +/// are assumed to be legal. +bool +ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, + EVT VT) const { + if (VT.getVectorNumElements() == 4 && + (VT.is128BitVector() || VT.is64BitVector())) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. 
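// Each of the four mask indices is one digit of a base-9 number: 0-7 select a
// lane of the two 4-element inputs and 8 stands for undef, so the flattened
// table index is d0*9^3 + d1*9^2 + d2*9 + d3, and the top two bits of the
// table entry encode the cost. A minimal standalone sketch of the indexing
// (helper names illustrative; the table layout is taken from the surrounding
// code):
static unsigned perfectShuffleIndex(const int Mask[4]) {
  unsigned Index = 0;
  for (unsigned i = 0; i != 4; ++i)
    Index = Index * 9 + (Mask[i] < 0 ? 8u : (unsigned)Mask[i]);
  return Index;   // e.g. mask <0,4,1,5> -> 0*729 + 4*81 + 1*9 + 5 = 338
}
static unsigned perfectShuffleCost(unsigned PFEntry) {
  return PFEntry >> 30;   // entries costing at most 4 are treated as legal
}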
+ unsigned PFTableIndex = + PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return true; + } + + bool ReverseVEXT; + unsigned Imm, WhichResult; + + return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isVREVMask(M, VT, 64) || + isVREVMask(M, VT, 32) || + isVREVMask(M, VT, 16) || + isVEXTMask(M, VT, ReverseVEXT, Imm) || + isVTRNMask(M, VT, WhichResult) || + isVUZPMask(M, VT, WhichResult) || + isVZIPMask(M, VT, WhichResult) || + isVTRN_v_undef_Mask(M, VT, WhichResult) || + isVUZP_v_undef_Mask(M, VT, WhichResult) || + isVZIP_v_undef_Mask(M, VT, WhichResult)); +} + +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. +static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + DebugLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); + + enum { + OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1*9+2)*9+3) return LHS; + assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); + return RHS; + } + + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + EVT VT = OpLHS.getValueType(); + + switch (OpNum) { + default: llvm_unreachable("Unknown shuffle opcode!"); + case OP_VREV: + return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, + OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32)); + case OP_VEXT1: + case OP_VEXT2: + case OP_VEXT3: + return DAG.getNode(ARMISD::VEXT, dl, VT, + OpLHS, OpRHS, + DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32)); + case OP_VUZPL: + case OP_VUZPR: + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); + case OP_VZIPL: + case OP_VZIPR: + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); + case OP_VTRNL: + case OP_VTRNR: + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); + } +} + +static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); + SmallVector<int, 8> ShuffleMask; + + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. + // FIXME: floating-point vectors should be canonicalized to integer vectors + // of the same time so that they get CSEd properly. 
+ SVN->getMask(ShuffleMask); + + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane == -1) Lane = 0; + + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + } + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i32)); + } + + bool ReverseVEXT; + unsigned Imm; + if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + if (ReverseVEXT) + std::swap(V1, V2); + return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } + + if (isVREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(ARMISD::VREV64, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(ARMISD::VREV32, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(ARMISD::VREV16, dl, VT, V1); + + // Check for Neon shuffles that modify both input vectors in place. + // If both results are used, i.e., if there are two shuffles with the same + // source operands and with masks corresponding to both results of one of + // these operations, DAG memoization will ensure that a single node is + // used for both shuffles. + unsigned WhichResult; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + + if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + + // If the shuffle is not directly supported and it has 4 elements, use + // the PerfectShuffle-generated table to synthesize it from other shuffles. + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 4) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (ShuffleMask[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = ShuffleMask[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = + PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + // Implement shuffles with 32- or 64-bit elements as subreg copies. + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize >= 32) { + // Do the expansion with floating-point types, since that is what the VFP + // registers are defined to use, and since i64 is not legal. 
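// In this fallback each mask entry m (undef entries are skipped in the loop
// below) selects operand V1 when m < NumElts and V2 otherwise, and the lane
// within that operand is m & (NumElts - 1). A minimal standalone sketch of
// the decoding (illustrative names; assumes m >= 0 and NumElts a power of two):
struct ShuffleSource { unsigned Operand; unsigned Lane; };
static ShuffleSource decodeShuffleEntry(int M, unsigned NumElts) {
  ShuffleSource S;
  S.Operand = (unsigned)M < NumElts ? 1u : 2u;   // 1 selects V1, 2 selects V2
  S.Lane    = (unsigned)M & (NumElts - 1);
  return S;   // e.g. NumElts == 2, mask <1,2>: V1 lane 1, then V2 lane 0
}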
+ EVT EltVT = EVT::getFloatingPointVT(EltSize); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V1); + V2 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V2); + SDValue Val = DAG.getUNDEF(VecVT); + for (unsigned i = 0; i < NumElts; ++i) { + if (ShuffleMask[i] < 0) + continue; + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + ShuffleMask[i] < (int)NumElts ? V1 : V2, + DAG.getConstant(ShuffleMask[i] & (NumElts-1), + MVT::i32)); + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Val, + Elt, DAG.getConstant(i, MVT::i32)); + } + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); + } + + return SDValue(); +} + +static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue Vec = Op.getOperand(0); + SDValue Lane = Op.getOperand(1); + assert(VT == MVT::i32 && + Vec.getValueType().getVectorElementType().getSizeInBits() < 32 && + "unexpected type for custom-lowering vector extract"); + return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { + // The only time a CONCAT_VECTORS operation can have legal types is when + // two 64-bit vectors are concatenated to a 128-bit vector. + assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && + "unexpected CONCAT_VECTORS"); + DebugLoc dl = Op.getDebugLoc(); + SDValue Val = DAG.getUNDEF(MVT::v2f64); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + if (Op0.getOpcode() != ISD::UNDEF) + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op0), + DAG.getIntPtrConstant(0)); + if (Op1.getOpcode() != ISD::UNDEF) + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op1), + DAG.getIntPtrConstant(1)); + return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val); +} + +SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: llvm_unreachable("Don't know how to custom lower this!"); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::GlobalAddress: + return Subtarget->isTargetDarwin() ? 
LowerGlobalAddressDarwin(Op, DAG) : + LowerGlobalAddressELF(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::BR_CC: return LowerBR_CC(Op, DAG); + case ISD::BR_JT: return LowerBR_JT(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG, Subtarget); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); + case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); + case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, + Subtarget); + case ISD::BIT_CONVERT: return ExpandBIT_CONVERT(Op.getNode(), DAG); + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); + case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); + case ISD::SRL_PARTS: + case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); + case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); + case ISD::VSETCC: return LowerVSETCC(Op, DAG); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + } + return SDValue(); +} + +/// ReplaceNodeResults - Replace the results of node with an illegal result +/// type with new values built out of custom code. +void ARMTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const { + SDValue Res; + switch (N->getOpcode()) { + default: + llvm_unreachable("Don't know how to custom expand this!"); + break; + case ISD::BIT_CONVERT: + Res = ExpandBIT_CONVERT(N, DAG); + break; + case ISD::SRL: + case ISD::SRA: + Res = LowerShift(N, DAG, Subtarget); + break; + } + if (Res.getNode()) + Results.push_back(Res); +} + +//===----------------------------------------------------------------------===// +// ARM Scheduler Hooks +//===----------------------------------------------------------------------===// + +MachineBasicBlock * +ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size) const { + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptr = MI->getOperand(1).getReg(); + unsigned oldval = MI->getOperand(2).getReg(); + unsigned newval = MI->getOperand(3).getReg(); + unsigned scratch = BB->getParent()->getRegInfo() + .createVirtualRegister(ARM::GPRRegisterClass); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + + unsigned ldrOpc, strOpc; + switch (Size) { + default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); + case 1: + ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; + strOpc = isThumb2 ? ARM::t2LDREXB : ARM::STREXB; + break; + case 2: + ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; + strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; + break; + case 4: + ldrOpc = isThumb2 ? 
ARM::t2LDREX : ARM::LDREX; + strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; + break; + } + + MachineFunction *MF = BB->getParent(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; // insert the new blocks after the current block + + MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loop1MBB); + MF->insert(It, loop2MBB); + MF->insert(It, exitMBB); + exitMBB->transferSuccessors(BB); + + // thisMBB: + // ... + // fallthrough --> loop1MBB + BB->addSuccessor(loop1MBB); + + // loop1MBB: + // ldrex dest, [ptr] + // cmp dest, oldval + // bne exitMBB + BB = loop1MBB; + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr)); + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(dest).addReg(oldval)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + BB->addSuccessor(loop2MBB); + BB->addSuccessor(exitMBB); + + // loop2MBB: + // strex scratch, newval, [ptr] + // cmp scratch, #0 + // bne loop1MBB + BB = loop2MBB; + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval) + .addReg(ptr)); + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(scratch).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + BB->addSuccessor(loop1MBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + + MF->DeleteMachineInstr(MI); // The instruction is gone now. + + return BB; +} + +MachineBasicBlock * +ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, + unsigned Size, unsigned BinOpcode) const { + // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction *MF = BB->getParent(); + MachineFunction::iterator It = BB; + ++It; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptr = MI->getOperand(1).getReg(); + unsigned incr = MI->getOperand(2).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + bool isThumb2 = Subtarget->isThumb2(); + unsigned ldrOpc, strOpc; + switch (Size) { + default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); + case 1: + ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; + strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; + break; + case 2: + ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; + strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; + break; + case 4: + ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; + strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; + break; + } + + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loopMBB); + MF->insert(It, exitMBB); + exitMBB->transferSuccessors(BB); + + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass); + unsigned scratch2 = (!BinOpcode) ? incr : + RegInfo.createVirtualRegister(ARM::GPRRegisterClass); + + // thisMBB: + // ... 
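// Behaviorally the block structure built below is a load-exclusive /
// store-exclusive retry loop: load the current value, apply the operation,
// and start over whenever the exclusive store reports that another observer
// touched the location. A minimal standalone model of those semantics using
// std::atomic (illustrative only, requires <atomic>; this is not the code the
// expansion emits):
template <typename Op>
static int atomicBinaryModel(std::atomic<int> &Location, int Incr, Op BinOp) {
  int Old = Location.load();
  while (!Location.compare_exchange_weak(Old, BinOp(Old, Incr))) {
    // Old now holds the freshly observed value; retry, just as the final
    // "bne loopMBB" does when strex writes a non-zero status.
  }
  return Old;   // the previously stored value, i.e. what 'dest' receives
}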
+ // fallthrough --> loopMBB + BB->addSuccessor(loopMBB); + + // loopMBB: + // ldrex dest, ptr + // <binop> scratch2, dest, incr + // strex scratch, scratch2, ptr + // cmp scratch, #0 + // bne- loopMBB + // fallthrough --> exitMBB + BB = loopMBB; + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr)); + if (BinOpcode) { + // operand order needs to go the other way for NAND + if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr) + AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2). + addReg(incr).addReg(dest)).addReg(0); + else + AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2). + addReg(dest).addReg(incr)).addReg(0); + } + + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2) + .addReg(ptr)); + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(scratch).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + + MF->DeleteMachineInstr(MI); // The instruction is gone now. + + return BB; +} + +MachineBasicBlock * +ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + switch (MI->getOpcode()) { + default: + MI->dump(); + llvm_unreachable("Unexpected instr type to insert"); + + case ARM::ATOMIC_LOAD_ADD_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); + case ARM::ATOMIC_LOAD_ADD_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); + case ARM::ATOMIC_LOAD_ADD_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); + + case ARM::ATOMIC_LOAD_AND_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); + case ARM::ATOMIC_LOAD_AND_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); + case ARM::ATOMIC_LOAD_AND_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); + + case ARM::ATOMIC_LOAD_OR_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); + case ARM::ATOMIC_LOAD_OR_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); + case ARM::ATOMIC_LOAD_OR_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); + + case ARM::ATOMIC_LOAD_XOR_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr); + case ARM::ATOMIC_LOAD_XOR_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr); + case ARM::ATOMIC_LOAD_XOR_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr); + + case ARM::ATOMIC_LOAD_NAND_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr); + case ARM::ATOMIC_LOAD_NAND_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr); + case ARM::ATOMIC_LOAD_NAND_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr); + + case ARM::ATOMIC_LOAD_SUB_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr); + case ARM::ATOMIC_LOAD_SUB_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr); + case ARM::ATOMIC_LOAD_SUB_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? 
ARM::t2SUBrr : ARM::SUBrr); + + case ARM::ATOMIC_SWAP_I8: return EmitAtomicBinary(MI, BB, 1, 0); + case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0); + case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0); + + case ARM::ATOMIC_CMP_SWAP_I8: return EmitAtomicCmpSwap(MI, BB, 1); + case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2); + case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4); + + case ARM::tMOVCCr_pseudo: { + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) + .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + // Update machine-CFG edges by first adding all successors of the current + // block to the new block which will contain the Phi node for the select. + for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), + E = BB->succ_end(); I != E; ++I) + sinkMBB->addSuccessor(*I); + // Next, remove all successors of the current block, and add the true + // and fallthrough blocks as its successors. + while (!BB->succ_empty()) + BB->removeSuccessor(BB->succ_begin()); + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BB = sinkMBB; + BuildMI(BB, dl, TII->get(ARM::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + return BB; + } + + case ARM::tANDsp: + case ARM::tADDspr_: + case ARM::tSUBspi_: + case ARM::t2SUBrSPi_: + case ARM::t2SUBrSPi12_: + case ARM::t2SUBrSPs_: { + MachineFunction *MF = BB->getParent(); + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + bool DstIsDead = MI->getOperand(0).isDead(); + bool SrcIsKill = MI->getOperand(1).isKill(); + + if (SrcReg != ARM::SP) { + // Copy the source to SP from virtual register. + const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg); + unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) + ? 
ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr; + BuildMI(BB, dl, TII->get(CopyOpc), ARM::SP) + .addReg(SrcReg, getKillRegState(SrcIsKill)); + } + + unsigned OpOpc = 0; + bool NeedPred = false, NeedCC = false, NeedOp3 = false; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected pseudo instruction!"); + case ARM::tANDsp: + OpOpc = ARM::tAND; + NeedPred = true; + break; + case ARM::tADDspr_: + OpOpc = ARM::tADDspr; + break; + case ARM::tSUBspi_: + OpOpc = ARM::tSUBspi; + break; + case ARM::t2SUBrSPi_: + OpOpc = ARM::t2SUBrSPi; + NeedPred = true; NeedCC = true; + break; + case ARM::t2SUBrSPi12_: + OpOpc = ARM::t2SUBrSPi12; + NeedPred = true; + break; + case ARM::t2SUBrSPs_: + OpOpc = ARM::t2SUBrSPs; + NeedPred = true; NeedCC = true; NeedOp3 = true; + break; + } + MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(OpOpc), ARM::SP); + if (OpOpc == ARM::tAND) + AddDefaultT1CC(MIB); + MIB.addReg(ARM::SP); + MIB.addOperand(MI->getOperand(2)); + if (NeedOp3) + MIB.addOperand(MI->getOperand(3)); + if (NeedPred) + AddDefaultPred(MIB); + if (NeedCC) + AddDefaultCC(MIB); + + // Copy the result from SP to virtual register. + const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg); + unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) + ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr; + BuildMI(BB, dl, TII->get(CopyOpc)) + .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(ARM::SP); + MF->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + return BB; + } + } +} + +//===----------------------------------------------------------------------===// +// ARM Optimization Hooks +//===----------------------------------------------------------------------===// + +static +SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = N->getValueType(0); + unsigned Opc = N->getOpcode(); + bool isSlctCC = Slct.getOpcode() == ISD::SELECT_CC; + SDValue LHS = isSlctCC ? Slct.getOperand(2) : Slct.getOperand(1); + SDValue RHS = isSlctCC ? Slct.getOperand(3) : Slct.getOperand(2); + ISD::CondCode CC = ISD::SETCC_INVALID; + + if (isSlctCC) { + CC = cast<CondCodeSDNode>(Slct.getOperand(4))->get(); + } else { + SDValue CCOp = Slct.getOperand(0); + if (CCOp.getOpcode() == ISD::SETCC) + CC = cast<CondCodeSDNode>(CCOp.getOperand(2))->get(); + } + + bool DoXform = false; + bool InvCC = false; + assert ((Opc == ISD::ADD || (Opc == ISD::SUB && Slct == N->getOperand(1))) && + "Bad input!"); + + if (LHS.getOpcode() == ISD::Constant && + cast<ConstantSDNode>(LHS)->isNullValue()) { + DoXform = true; + } else if (CC != ISD::SETCC_INVALID && + RHS.getOpcode() == ISD::Constant && + cast<ConstantSDNode>(RHS)->isNullValue()) { + std::swap(LHS, RHS); + SDValue Op0 = Slct.getOperand(0); + EVT OpVT = isSlctCC ? Op0.getValueType() : + Op0.getOperand(0).getValueType(); + bool isInt = OpVT.isInteger(); + CC = ISD::getSetCCInverse(CC, isInt); + + if (!TLI.isCondCodeLegal(CC, OpVT)) + return SDValue(); // Inverse operator isn't legal. 
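// The rewrite applied below turns  add(select(cc, 0, c), x)  into
// select(cc, x, add(x, c)): when the select yields 0 the addition is the
// identity, so only the other arm really needs it, and the same reasoning
// covers the sub form guarded by the assert above. A minimal scalar sketch of
// the equivalence (illustrative only):
static int foldedSelectAdd(bool Cond, int C, int X) {
  // original:     (Cond ? 0 : C) + X
  // transformed:  Cond ? X : (X + C)   -- identical for both values of Cond
  return Cond ? X : (X + C);
}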
+ + DoXform = true; + InvCC = true; + } + + if (DoXform) { + SDValue Result = DAG.getNode(Opc, RHS.getDebugLoc(), VT, OtherOp, RHS); + if (isSlctCC) + return DAG.getSelectCC(N->getDebugLoc(), OtherOp, Result, + Slct.getOperand(0), Slct.getOperand(1), CC); + SDValue CCOp = Slct.getOperand(0); + if (InvCC) + CCOp = DAG.getSetCC(Slct.getDebugLoc(), CCOp.getValueType(), + CCOp.getOperand(0), CCOp.getOperand(1), CC); + return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT, + CCOp, OtherOp, Result); + } + return SDValue(); +} + +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // added by evan in r37685 with no testcase. + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + + // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) + if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) { + SDValue Result = combineSelectAndUse(N, N0, N1, DCI); + if (Result.getNode()) return Result; + } + if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) { + SDValue Result = combineSelectAndUse(N, N1, N0, DCI); + if (Result.getNode()) return Result; + } + + return SDValue(); +} + +/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. +static SDValue PerformSUBCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // added by evan in r37685 with no testcase. + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + + // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) + if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) { + SDValue Result = combineSelectAndUse(N, N1, N0, DCI); + if (Result.getNode()) return Result; + } + + return SDValue(); +} + +static SDValue PerformMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + + if (Subtarget->isThumb1Only()) + return SDValue(); + + if (DAG.getMachineFunction(). + getFunction()->hasFnAttr(Attribute::OptimizeForSize)) + return SDValue(); + + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32) + return SDValue(); + + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C) + return SDValue(); + + uint64_t MulAmt = C->getZExtValue(); + unsigned ShiftAmt = CountTrailingZeros_64(MulAmt); + ShiftAmt = ShiftAmt & (32 - 1); + SDValue V = N->getOperand(0); + DebugLoc DL = N->getDebugLoc(); + + SDValue Res; + MulAmt >>= ShiftAmt; + if (isPowerOf2_32(MulAmt - 1)) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + Res = DAG.getNode(ISD::ADD, DL, VT, + V, DAG.getNode(ISD::SHL, DL, VT, + V, DAG.getConstant(Log2_32(MulAmt-1), + MVT::i32))); + } else if (isPowerOf2_32(MulAmt + 1)) { + // (mul x, 2^N - 1) => (sub (shl x, N), x) + Res = DAG.getNode(ISD::SUB, DL, VT, + DAG.getNode(ISD::SHL, DL, VT, + V, DAG.getConstant(Log2_32(MulAmt+1), + MVT::i32)), + V); + } else + return SDValue(); + + if (ShiftAmt != 0) + Res = DAG.getNode(ISD::SHL, DL, VT, Res, + DAG.getConstant(ShiftAmt, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + return SDValue(); +} + +/// PerformVMOVRRDCombine - Target-specific dag combine xforms for +/// ARMISD::VMOVRRD. 
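// VMOVDRR packs two i32 values into one double-word register and VMOVRRD
// splits one back out, so a VMOVRRD fed directly by a VMOVDRR can simply
// forward the two original operands, which is what the combine below does.
// A minimal standalone model of the round trip (illustrative names; requires
// <cstdint>):
static void splitBuiltDouble(uint32_t Lo, uint32_t Hi,
                             uint32_t &OutLo, uint32_t &OutHi) {
  uint64_t D = ((uint64_t)Hi << 32) | Lo;   // the "fmdrr" / VMOVDRR step
  OutLo = (uint32_t)D;                      // the "fmrrd" / VMOVRRD halves
  OutHi = (uint32_t)(D >> 32);              // == the values that went in
}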
+static SDValue PerformVMOVRRDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // fmrrd(fmdrr x, y) -> x,y + SDValue InDouble = N->getOperand(0); + if (InDouble.getOpcode() == ARMISD::VMOVDRR) + return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); + return SDValue(); +} + +/// getVShiftImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BIT_CONVERT) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. +static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (! getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); +} + +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. For a shift opcode, the value +/// is positive, but for an intrinsic the value count must be negative. The +/// absolute value must be in the range: +/// 1 <= |Value| <= ElementBits for a right shift; or +/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, + int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (! getVShiftImm(Op, ElementBits, Cnt)) + return false; + if (isIntrinsic) + Cnt = -Cnt; + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); +} + +/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. +static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + // Don't do anything for most intrinsics. + break; + + // Vector shifts: check for immediate versions and lower them. + // Note: This is done during DAG combining instead of DAG legalizing because + // the build_vectors for 64-bit vector element shift counts are generally + // not legal, and it is hard to see their values after they get legalized to + // loads from a constant pool. 
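// For element size EB, the helpers above accept
//   left shifts:   0 <= Cnt <  EB   (or Cnt <= EB for the long vshll forms)
//   right shifts:  1 <= Cnt <= EB   (or Cnt <= EB/2 for the narrowing forms)
// A minimal standalone restatement of those ranges (illustrative names only):
static bool legalVShiftLeftImm(long long Cnt, unsigned ElementBits, bool IsLong) {
  return Cnt >= 0 && (IsLong ? Cnt - 1 : Cnt) < (long long)ElementBits;
}
static bool legalVShiftRightImm(long long Cnt, unsigned ElementBits, bool IsNarrow) {
  return Cnt >= 1 && Cnt <= (long long)(IsNarrow ? ElementBits / 2 : ElementBits);
}
// e.g. on v8i16 a left shift count of 16 is only valid for vshll, and a
// narrowing right shift must use a count between 1 and 8.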
+ case Intrinsic::arm_neon_vshifts: + case Intrinsic::arm_neon_vshiftu: + case Intrinsic::arm_neon_vshiftls: + case Intrinsic::arm_neon_vshiftlu: + case Intrinsic::arm_neon_vshiftn: + case Intrinsic::arm_neon_vrshifts: + case Intrinsic::arm_neon_vrshiftu: + case Intrinsic::arm_neon_vrshiftn: + case Intrinsic::arm_neon_vqshifts: + case Intrinsic::arm_neon_vqshiftu: + case Intrinsic::arm_neon_vqshiftsu: + case Intrinsic::arm_neon_vqshiftns: + case Intrinsic::arm_neon_vqshiftnu: + case Intrinsic::arm_neon_vqshiftnsu: + case Intrinsic::arm_neon_vqrshiftns: + case Intrinsic::arm_neon_vqrshiftnu: + case Intrinsic::arm_neon_vqrshiftnsu: { + EVT VT = N->getOperand(1).getValueType(); + int64_t Cnt; + unsigned VShiftOpc = 0; + + switch (IntNo) { + case Intrinsic::arm_neon_vshifts: + case Intrinsic::arm_neon_vshiftu: + if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { + VShiftOpc = ARMISD::VSHL; + break; + } + if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { + VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? + ARMISD::VSHRs : ARMISD::VSHRu); + break; + } + return SDValue(); + + case Intrinsic::arm_neon_vshiftls: + case Intrinsic::arm_neon_vshiftlu: + if (isVShiftLImm(N->getOperand(2), VT, true, Cnt)) + break; + llvm_unreachable("invalid shift count for vshll intrinsic"); + + case Intrinsic::arm_neon_vrshifts: + case Intrinsic::arm_neon_vrshiftu: + if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) + break; + return SDValue(); + + case Intrinsic::arm_neon_vqshifts: + case Intrinsic::arm_neon_vqshiftu: + if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) + break; + return SDValue(); + + case Intrinsic::arm_neon_vqshiftsu: + if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) + break; + llvm_unreachable("invalid shift count for vqshlu intrinsic"); + + case Intrinsic::arm_neon_vshiftn: + case Intrinsic::arm_neon_vrshiftn: + case Intrinsic::arm_neon_vqshiftns: + case Intrinsic::arm_neon_vqshiftnu: + case Intrinsic::arm_neon_vqshiftnsu: + case Intrinsic::arm_neon_vqrshiftns: + case Intrinsic::arm_neon_vqrshiftnu: + case Intrinsic::arm_neon_vqrshiftnsu: + // Narrowing shifts require an immediate right shift. + if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) + break; + llvm_unreachable("invalid shift count for narrowing vector shift intrinsic"); + + default: + llvm_unreachable("unhandled vector shift"); + } + + switch (IntNo) { + case Intrinsic::arm_neon_vshifts: + case Intrinsic::arm_neon_vshiftu: + // Opcode already set above. + break; + case Intrinsic::arm_neon_vshiftls: + case Intrinsic::arm_neon_vshiftlu: + if (Cnt == VT.getVectorElementType().getSizeInBits()) + VShiftOpc = ARMISD::VSHLLi; + else + VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ? 
+ ARMISD::VSHLLs : ARMISD::VSHLLu); + break; + case Intrinsic::arm_neon_vshiftn: + VShiftOpc = ARMISD::VSHRN; break; + case Intrinsic::arm_neon_vrshifts: + VShiftOpc = ARMISD::VRSHRs; break; + case Intrinsic::arm_neon_vrshiftu: + VShiftOpc = ARMISD::VRSHRu; break; + case Intrinsic::arm_neon_vrshiftn: + VShiftOpc = ARMISD::VRSHRN; break; + case Intrinsic::arm_neon_vqshifts: + VShiftOpc = ARMISD::VQSHLs; break; + case Intrinsic::arm_neon_vqshiftu: + VShiftOpc = ARMISD::VQSHLu; break; + case Intrinsic::arm_neon_vqshiftsu: + VShiftOpc = ARMISD::VQSHLsu; break; + case Intrinsic::arm_neon_vqshiftns: + VShiftOpc = ARMISD::VQSHRNs; break; + case Intrinsic::arm_neon_vqshiftnu: + VShiftOpc = ARMISD::VQSHRNu; break; + case Intrinsic::arm_neon_vqshiftnsu: + VShiftOpc = ARMISD::VQSHRNsu; break; + case Intrinsic::arm_neon_vqrshiftns: + VShiftOpc = ARMISD::VQRSHRNs; break; + case Intrinsic::arm_neon_vqrshiftnu: + VShiftOpc = ARMISD::VQRSHRNu; break; + case Intrinsic::arm_neon_vqrshiftnsu: + VShiftOpc = ARMISD::VQRSHRNsu; break; + } + + return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), + N->getOperand(1), DAG.getConstant(Cnt, MVT::i32)); + } + + case Intrinsic::arm_neon_vshiftins: { + EVT VT = N->getOperand(1).getValueType(); + int64_t Cnt; + unsigned VShiftOpc = 0; + + if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) + VShiftOpc = ARMISD::VSLI; + else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) + VShiftOpc = ARMISD::VSRI; + else { + llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); + } + + return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), + N->getOperand(1), N->getOperand(2), + DAG.getConstant(Cnt, MVT::i32)); + } + + case Intrinsic::arm_neon_vqrshifts: + case Intrinsic::arm_neon_vqrshiftu: + // No immediate versions of these to check for. + break; + } + + return SDValue(); +} + +/// PerformShiftCombine - Checks for immediate versions of vector shifts and +/// lowers them. As with the vector shift intrinsics, this is done during DAG +/// combining instead of DAG legalizing because the build_vectors for 64-bit +/// vector element shift counts are generally not legal, and it is hard to see +/// their values after they get legalized to loads from a constant pool. +static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + + // Nothing to be done for scalar shifts. + if (! VT.isVector()) + return SDValue(); + + assert(ST->hasNEON() && "unexpected vector shift"); + int64_t Cnt; + + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected shift opcode"); + + case ISD::SHL: + if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) + return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + break; + + case ISD::SRA: + case ISD::SRL: + if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { + unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? + ARMISD::VSHRs : ARMISD::VSHRu); + return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + } + } + return SDValue(); +} + +/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, +/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. +static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue N0 = N->getOperand(0); + + // Check for sign- and zero-extensions of vector extract operations of 8- + // and 16-bit vector elements. NEON supports these directly. 
They are + // handled during DAG combining because type legalization will promote them + // to 32-bit types and it is messy to recognize the operations after that. + if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue Vec = N0.getOperand(0); + SDValue Lane = N0.getOperand(1); + EVT VT = N->getValueType(0); + EVT EltVT = N0.getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (VT == MVT::i32 && + (EltVT == MVT::i8 || EltVT == MVT::i16) && + TLI.isTypeLegal(Vec.getValueType())) { + + unsigned Opc = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ISD::SIGN_EXTEND: + Opc = ARMISD::VGETLANEs; + break; + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + Opc = ARMISD::VGETLANEu; + break; + } + return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane); + } + } + + return SDValue(); +} + +/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC +/// to match f32 max/min patterns to use NEON vmax/vmin instructions. +static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + // If the target supports NEON, try to use vmax/vmin instructions for f32 + // selects like "x < y ? x : y". Unless the FiniteOnlyFPMath option is set, + // be careful about NaNs: NEON's vmax/vmin return NaN if either operand is + // a NaN; only do the transformation when it matches that behavior. + + // For now only do this when using NEON for FP operations; if using VFP, it + // is not obvious that the benefit outweighs the cost of switching to the + // NEON pipeline. + if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() || + N->getValueType(0) != MVT::f32) + return SDValue(); + + SDValue CondLHS = N->getOperand(0); + SDValue CondRHS = N->getOperand(1); + SDValue LHS = N->getOperand(2); + SDValue RHS = N->getOperand(3); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + + unsigned Opcode = 0; + bool IsReversed; + if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) { + IsReversed = false; // x CC y ? x : y + } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) { + IsReversed = true ; // x CC y ? y : x + } else { + return SDValue(); + } + + bool IsUnordered; + switch (CC) { + default: break; + case ISD::SETOLT: + case ISD::SETOLE: + case ISD::SETLT: + case ISD::SETLE: + case ISD::SETULT: + case ISD::SETULE: + // If LHS is NaN, an ordered comparison will be false and the result will + // be the RHS, but vmin(NaN, RHS) = NaN. Avoid this by checking that LHS + // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. + IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE); + if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) + break; + // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin + // will return -0, so vmin can only be used for unsafe math or if one of + // the operands is known to be nonzero. + if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) && + !UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN; + break; + + case ISD::SETOGT: + case ISD::SETOGE: + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + // If LHS is NaN, an ordered comparison will be false and the result will + // be the RHS, but vmax(NaN, RHS) = NaN. Avoid this by checking that LHS + // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. 
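// A concrete illustration of the hazard described above (standalone sketch,
// requires <cmath>; not part of the combine): with LHS == NaN the select
// "LHS > RHS ? LHS : RHS" produces RHS, while NEON vmax would produce NaN, so
// the transformation is only safe once LHS is known not to be NaN.
static float selectMaxSemantics(float LHS, float RHS) {
  return LHS > RHS ? LHS : RHS;   // NaN > x is false, so this returns RHS
}
static float vmaxSemantics(float LHS, float RHS) {
  if (std::isnan(LHS) || std::isnan(RHS))
    return NAN;                   // vmax/vmin return NaN if either input is NaN
  return LHS > RHS ? LHS : RHS;
}
// The signed-zero restriction is analogous: "-0 >= +0" is true, so the select
// keeps -0, whereas vmax(-0, +0) returns +0.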
+ IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE); + if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) + break; + // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax + // will return +0, so vmax can only be used for unsafe math or if one of + // the operands is known to be nonzero. + if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) && + !UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX; + break; + } + + if (!Opcode) + return SDValue(); + return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS); +} + +SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + switch (N->getOpcode()) { + default: break; + case ISD::ADD: return PerformADDCombine(N, DCI); + case ISD::SUB: return PerformSUBCombine(N, DCI); + case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); + case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI); + case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); + } + return SDValue(); +} + +bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { + if (!Subtarget->hasV6Ops()) + // Pre-v6 does not support unaligned mem access. + return false; + else { + // v6+ may or may not support unaligned mem access depending on the system + // configuration. + // FIXME: This is pretty conservative. Should we provide cmdline option to + // control the behaviour? + if (!Subtarget->isTargetDarwin()) + return false; + } + + switch (VT.getSimpleVT().SimpleTy) { + default: + return false; + case MVT::i8: + case MVT::i16: + case MVT::i32: + return true; + // FIXME: VLD1 etc with standard alignment is legal. + } +} + +static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { + if (V < 0) + return false; + + unsigned Scale = 1; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + // Scale == 1; + break; + case MVT::i16: + // Scale == 2; + Scale = 2; + break; + case MVT::i32: + // Scale == 4; + Scale = 4; + break; + } + + if ((V & (Scale - 1)) != 0) + return false; + V /= Scale; + return V == (V & ((1LL << 5) - 1)); +} + +static bool isLegalT2AddressImmediate(int64_t V, EVT VT, + const ARMSubtarget *Subtarget) { + bool isNeg = false; + if (V < 0) { + isNeg = true; + V = - V; + } + + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + // + imm12 or - imm8 + if (isNeg) + return V == (V & ((1LL << 8) - 1)); + return V == (V & ((1LL << 12) - 1)); + case MVT::f32: + case MVT::f64: + // Same as ARM mode. FIXME: NEON? + if (!Subtarget->hasVFP2()) + return false; + if ((V & 3) != 0) + return false; + V >>= 2; + return V == (V & ((1LL << 8) - 1)); + } +} + +/// isLegalAddressImmediate - Return true if the integer value can be used +/// as the offset of the target addressing mode for load / store of the +/// given type. 
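// For Thumb1 the helper above requires the offset to be non-negative, a
// multiple of the access size, and representable in the 5-bit scaled
// immediate field once divided by that size. A minimal standalone restatement
// with the same scales (illustrative name only):
static bool legalThumb1Offset(long long V, unsigned AccessBytes) {
  if (V < 0 || V % AccessBytes != 0)
    return false;
  return V / AccessBytes <= 31;   // 5-bit scaled immediate
}
// e.g. for an i32 access, offset 124 (31 * 4) is legal but 128 is not.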
+static bool isLegalAddressImmediate(int64_t V, EVT VT, + const ARMSubtarget *Subtarget) { + if (V == 0) + return true; + + if (!VT.isSimple()) + return false; + + if (Subtarget->isThumb1Only()) + return isLegalT1AddressImmediate(V, VT); + else if (Subtarget->isThumb2()) + return isLegalT2AddressImmediate(V, VT, Subtarget); + + // ARM mode. + if (V < 0) + V = - V; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i32: + // +- imm12 + return V == (V & ((1LL << 12) - 1)); + case MVT::i16: + // +- imm8 + return V == (V & ((1LL << 8) - 1)); + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasVFP2()) // FIXME: NEON? + return false; + if ((V & 3) != 0) + return false; + V >>= 2; + return V == (V & ((1LL << 8) - 1)); + } +} + +bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, + EVT VT) const { + int Scale = AM.Scale; + if (Scale < 0) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + if (Scale == 1) + return true; + // r + r << imm + Scale = Scale & ~1; + return Scale == 2 || Scale == 4 || Scale == 8; + case MVT::i64: + // r + r + if (((unsigned)AM.HasBaseReg + Scale) <= 2) + return true; + return false; + case MVT::isVoid: + // Note, we allow "void" uses (basically, uses that aren't loads or + // stores), because arm allows folding a scale into many arithmetic + // operations. This should be made more precise and revisited later. + + // Allow r << imm, but the imm has to be a multiple of two. + if (Scale & 1) return false; + return isPowerOf2_32(Scale); + } +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, + const Type *Ty) const { + EVT VT = getValueType(Ty, true); + if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) + return false; + + // Can never fold addr of global into load/store. + if (AM.BaseGV) + return false; + + switch (AM.Scale) { + case 0: // no scale reg, must be "r+i" or "r", or "i". + break; + case 1: + if (Subtarget->isThumb1Only()) + return false; + // FALL THROUGH. + default: + // ARM doesn't support any R+R*scale+imm addr modes. + if (AM.BaseOffs) + return false; + + if (!VT.isSimple()) + return false; + + if (Subtarget->isThumb2()) + return isLegalT2ScaledAddressingMode(AM, VT); + + int Scale = AM.Scale; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i32: + if (Scale < 0) Scale = -Scale; + if (Scale == 1) + return true; + // r + r << imm + return isPowerOf2_32(Scale & ~1); + case MVT::i16: + case MVT::i64: + // r + r + if (((unsigned)AM.HasBaseReg + Scale) <= 2) + return true; + return false; + + case MVT::isVoid: + // Note, we allow "void" uses (basically, uses that aren't loads or + // stores), because arm allows folding a scale into many arithmetic + // operations. This should be made more precise and revisited later. + + // Allow r << imm, but the imm has to be a multiple of two. + if (Scale & 1) return false; + return isPowerOf2_32(Scale); + } + break; + } + return true; +} + +/// isLegalICmpImmediate - Return true if the specified immediate is legal +/// icmp immediate, that is the target has icmp instructions which can compare +/// a register against the immediate without having to materialize the +/// immediate into a register. 
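// In ARM mode the offsets accepted above are +/-4095 for i8/i32 (imm12),
// +/-255 for i16 (imm8), and a multiple of 4 up to +/-1020 for VFP f32/f64
// accesses (imm8 scaled by 4). A minimal standalone restatement (illustrative
// name only):
static bool legalARMModeOffset(long long V, unsigned TypeBits, bool IsFP) {
  if (V < 0) V = -V;
  if (IsFP)
    return (V & 3) == 0 && (V >> 2) <= 255;   // imm8 * 4
  if (TypeBits == 16)
    return V <= 255;                          // +/- imm8
  return V <= 4095;                           // +/- imm12
}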
+bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + if (!Subtarget->isThumb()) + return ARM_AM::getSOImmVal(Imm) != -1; + if (Subtarget->isThumb2()) + return ARM_AM::getT2SOImmVal(Imm) != -1; + return Imm >= 0 && Imm <= 255; +} + +static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, + bool isSEXTLoad, SDValue &Base, + SDValue &Offset, bool &isInc, + SelectionDAG &DAG) { + if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) + return false; + + if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { + // AddressingMode 3 + Base = Ptr->getOperand(0); + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC < 0 && RHSC > -256) { + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, RHS->getValueType(0)); + return true; + } + } + isInc = (Ptr->getOpcode() == ISD::ADD); + Offset = Ptr->getOperand(1); + return true; + } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { + // AddressingMode 2 + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC < 0 && RHSC > -0x1000) { + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, RHS->getValueType(0)); + Base = Ptr->getOperand(0); + return true; + } + } + + if (Ptr->getOpcode() == ISD::ADD) { + isInc = true; + ARM_AM::ShiftOpc ShOpcVal= ARM_AM::getShiftOpcForNode(Ptr->getOperand(0)); + if (ShOpcVal != ARM_AM::no_shift) { + Base = Ptr->getOperand(1); + Offset = Ptr->getOperand(0); + } else { + Base = Ptr->getOperand(0); + Offset = Ptr->getOperand(1); + } + return true; + } + + isInc = (Ptr->getOpcode() == ISD::ADD); + Base = Ptr->getOperand(0); + Offset = Ptr->getOperand(1); + return true; + } + + // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. + return false; +} + +static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, + bool isSEXTLoad, SDValue &Base, + SDValue &Offset, bool &isInc, + SelectionDAG &DAG) { + if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) + return false; + + Base = Ptr->getOperand(0); + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC < 0 && RHSC > -0x100) { // 8 bits. + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, RHS->getValueType(0)); + return true; + } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. + isInc = Ptr->getOpcode() == ISD::ADD; + Offset = DAG.getConstant(RHSC, RHS->getValueType(0)); + return true; + } + } + + return false; +} + +/// getPreIndexedAddressParts - returns true by value, base pointer and +/// offset pointer and addressing mode by reference if the node's address +/// can be legally represented as pre-indexed load / store address. 
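// The Thumb2 matcher above accepts an 8-bit offset in either direction,
// folding a negative constant into a subtracting form and rejecting zero for
// the positive case. A minimal standalone restatement of that range check
// (illustrative name only):
static bool legalT2IndexedOffset(int RHSC, bool &IsSub) {
  if (RHSC < 0 && RHSC > -0x100) { IsSub = true;  return true; }  // -255..-1
  if (RHSC > 0 && RHSC <  0x100) { IsSub = false; return true; }  // 1..255
  return false;
}
// The ARM-mode matcher is analogous, with -255..-1 (addressing mode 3) and
// -4095..-1 (addressing mode 2) for the subtracting forms.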
+bool +ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + if (Subtarget->isThumb1Only()) + return false; + + EVT VT; + SDValue Ptr; + bool isSEXTLoad = false; + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + Ptr = LD->getBasePtr(); + VT = LD->getMemoryVT(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + Ptr = ST->getBasePtr(); + VT = ST->getMemoryVT(); + } else + return false; + + bool isInc; + bool isLegal = false; + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + if (!isLegal) + return false; + + AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; + return true; +} + +/// getPostIndexedAddressParts - returns true by value, base pointer and +/// offset pointer and addressing mode by reference if this node can be +/// combined with a load / store to form a post-indexed load / store. +bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, + SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + if (Subtarget->isThumb1Only()) + return false; + + EVT VT; + SDValue Ptr; + bool isSEXTLoad = false; + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; + + bool isInc; + bool isLegal = false; + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + if (!isLegal) + return false; + + if (Ptr != Base) { + // Swap base ptr and offset to catch more post-index load / store when + // it's legal. In Thumb2 mode, offset must be an immediate. + if (Ptr == Offset && Op->getOpcode() == ISD::ADD && + !Subtarget->isThumb2()) + std::swap(Base, Offset); + + // Post-indexed load / store update the base pointer. + if (Ptr != Base) + return false; + } + + AM = isInc ? ISD::POST_INC : ISD::POST_DEC; + return true; +} + +void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, + const APInt &Mask, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); + switch (Op.getOpcode()) { + default: break; + case ARMISD::CMOV: { + // Bits are known zero/one if known on the LHS and RHS. + DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1); + if (KnownZero == 0 && KnownOne == 0) return; + + APInt KnownZeroRHS, KnownOneRHS; + DAG.ComputeMaskedBits(Op.getOperand(1), Mask, + KnownZeroRHS, KnownOneRHS, Depth+1); + KnownZero &= KnownZeroRHS; + KnownOne &= KnownOneRHS; + return; + } + } +} + +//===----------------------------------------------------------------------===// +// ARM Inline Assembly Support +//===----------------------------------------------------------------------===// + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. 
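// For the CMOV case above, the result may be either operand, so a bit is only
// known when both operands agree on it: the known-zero and known-one masks
// are intersected. A minimal standalone illustration (illustrative name only):
static void cmovKnownBits(unsigned ZeroL, unsigned OneL,
                          unsigned ZeroR, unsigned OneR,
                          unsigned &ZeroOut, unsigned &OneOut) {
  ZeroOut = ZeroL & ZeroR;
  OneOut  = OneL  & OneR;
}
// e.g. if both arms of the conditional move are word-aligned addresses, the
// low two bits of the result are still known to be zero.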
+ARMTargetLowering::ConstraintType +ARMTargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 'l': return C_RegisterClass; + case 'w': return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +std::pair<unsigned, const TargetRegisterClass*> +ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const { + if (Constraint.size() == 1) { + // GCC ARM Constraint Letters + switch (Constraint[0]) { + case 'l': + if (Subtarget->isThumb()) + return std::make_pair(0U, ARM::tGPRRegisterClass); + else + return std::make_pair(0U, ARM::GPRRegisterClass); + case 'r': + return std::make_pair(0U, ARM::GPRRegisterClass); + case 'w': + if (VT == MVT::f32) + return std::make_pair(0U, ARM::SPRRegisterClass); + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, ARM::DPRRegisterClass); + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, ARM::QPRRegisterClass); + break; + } + } + if (StringRef("{cc}").equals_lower(Constraint)) + return std::make_pair(0U, ARM::CCRRegisterClass); + + return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); +} + +std::vector<unsigned> ARMTargetLowering:: +getRegClassForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const { + if (Constraint.size() != 1) + return std::vector<unsigned>(); + + switch (Constraint[0]) { // GCC ARM Constraint Letters + default: break; + case 'l': + return make_vector<unsigned>(ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R4, ARM::R5, ARM::R6, ARM::R7, + 0); + case 'r': + return make_vector<unsigned>(ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R4, ARM::R5, ARM::R6, ARM::R7, + ARM::R8, ARM::R9, ARM::R10, ARM::R11, + ARM::R12, ARM::LR, 0); + case 'w': + if (VT == MVT::f32) + return make_vector<unsigned>(ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, + ARM::S12,ARM::S13,ARM::S14,ARM::S15, + ARM::S16,ARM::S17,ARM::S18,ARM::S19, + ARM::S20,ARM::S21,ARM::S22,ARM::S23, + ARM::S24,ARM::S25,ARM::S26,ARM::S27, + ARM::S28,ARM::S29,ARM::S30,ARM::S31, 0); + if (VT.getSizeInBits() == 64) + return make_vector<unsigned>(ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7, + ARM::D8, ARM::D9, ARM::D10,ARM::D11, + ARM::D12,ARM::D13,ARM::D14,ARM::D15, 0); + if (VT.getSizeInBits() == 128) + return make_vector<unsigned>(ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3, + ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7, 0); + break; + } + + return std::vector<unsigned>(); +} + +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, + char Constraint, + bool hasMemory, + std::vector<SDValue>&Ops, + SelectionDAG &DAG) const { + SDValue Result(0, 0); + + switch (Constraint) { + default: break; + case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); + if (!C) + return; + + int64_t CVal64 = C->getSExtValue(); + int CVal = (int) CVal64; + // None of these constraints allow values larger than 32 bits. Check + // that the value fits in an int. + if (CVal != CVal64) + return; + + switch (Constraint) { + case 'I': + if (Subtarget->isThumb1Only()) { + // This must be a constant between 0 and 255, for ADD + // immediates. 
+ if (CVal >= 0 && CVal <= 255) + break; + } else if (Subtarget->isThumb2()) { + // A constant that can be used as an immediate value in a + // data-processing instruction. + if (ARM_AM::getT2SOImmVal(CVal) != -1) + break; + } else { + // A constant that can be used as an immediate value in a + // data-processing instruction. + if (ARM_AM::getSOImmVal(CVal) != -1) + break; + } + return; + + case 'J': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a constant between -255 and -1, for negated ADD + // immediates. This can be used in GCC with an "n" modifier that + // prints the negated value, for use with SUB instructions. It is + // not useful otherwise but is implemented for compatibility. + if (CVal >= -255 && CVal <= -1) + break; + } else { + // This must be a constant between -4095 and 4095. It is not clear + // what this constraint is intended for. Implemented for + // compatibility with GCC. + if (CVal >= -4095 && CVal <= 4095) + break; + } + return; + + case 'K': + if (Subtarget->isThumb1Only()) { + // A 32-bit value where only one byte has a nonzero value. Exclude + // zero to match GCC. This constraint is used by GCC internally for + // constants that can be loaded with a move/shift combination. + // It is not useful otherwise but is implemented for compatibility. + if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) + break; + } else if (Subtarget->isThumb2()) { + // A constant whose bitwise inverse can be used as an immediate + // value in a data-processing instruction. This can be used in GCC + // with a "B" modifier that prints the inverted value, for use with + // BIC and MVN instructions. It is not useful otherwise but is + // implemented for compatibility. + if (ARM_AM::getT2SOImmVal(~CVal) != -1) + break; + } else { + // A constant whose bitwise inverse can be used as an immediate + // value in a data-processing instruction. This can be used in GCC + // with a "B" modifier that prints the inverted value, for use with + // BIC and MVN instructions. It is not useful otherwise but is + // implemented for compatibility. + if (ARM_AM::getSOImmVal(~CVal) != -1) + break; + } + return; + + case 'L': + if (Subtarget->isThumb1Only()) { + // This must be a constant between -7 and 7, + // for 3-operand ADD/SUB immediate instructions. + if (CVal >= -7 && CVal < 7) + break; + } else if (Subtarget->isThumb2()) { + // A constant whose negation can be used as an immediate value in a + // data-processing instruction. This can be used in GCC with an "n" + // modifier that prints the negated value, for use with SUB + // instructions. It is not useful otherwise but is implemented for + // compatibility. + if (ARM_AM::getT2SOImmVal(-CVal) != -1) + break; + } else { + // A constant whose negation can be used as an immediate value in a + // data-processing instruction. This can be used in GCC with an "n" + // modifier that prints the negated value, for use with SUB + // instructions. It is not useful otherwise but is implemented for + // compatibility. + if (ARM_AM::getSOImmVal(-CVal) != -1) + break; + } + return; + + case 'M': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a multiple of 4 between 0 and 1020, for + // ADD sp + immediate. + if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) + break; + } else { + // A power of two or a constant between 0 and 32. This is used in + // GCC for the shift amount on shifted register operands, but it is + // useful in general for any shift amounts. 
+ if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) + break; + } + return; + + case 'N': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a constant between 0 and 31, for shift amounts. + if (CVal >= 0 && CVal <= 31) + break; + } + return; + + case 'O': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a multiple of 4 between -508 and 508, for + // ADD/SUB sp = sp + immediate. + if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) + break; + } + return; + } + Result = DAG.getTargetConstant(CVal, Op.getValueType()); + break; + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, + Ops, DAG); +} + +bool +ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // The ARM target isn't yet aware of offsets. + return false; +} + +int ARM::getVFPf32Imm(const APFloat &FPImm) { + APInt Imm = FPImm.bitcastToAPInt(); + uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 + int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x7ffff) + return -1; + Mantissa >>= 19; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +int ARM::getVFPf64Imm(const APFloat &FPImm) { + APInt Imm = FPImm.bitcastToAPInt(); + uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; + int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 + uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffLL; + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0xffffffffffffLL) + return -1; + Mantissa >>= 48; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +/// isFPImmLegal - Returns true if the target can instruction select the +/// specified FP immediate natively. If false, the legalizer will +/// materialize the FP immediate as a load from a constant pool. +bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + if (!Subtarget->hasVFP3()) + return false; + if (VT == MVT::f32) + return ARM::getVFPf32Imm(Imm) != -1; + if (VT == MVT::f64) + return ARM::getVFPf64Imm(Imm) != -1; + return false; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h new file mode 100644 index 0000000..9c7517c --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -0,0 +1,354 @@ +//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that ARM uses to lower LLVM code into a +// selection DAG. 
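The VFP modified-immediate encoders near the end of ARMISelLowering.cpp above (ARM::getVFPf32Imm / ARM::getVFPf64Imm) pack a constant into eight bits: bit 7 is the sign, bits 6-4 encode an exponent in [-3, 4], and bits 3-0 carry the top four mantissa bits, so the value is +/-(16 + efgh)/16 * 2^exp. A hedged, standalone decoder with one worked case (illustrative only, not part of the patch):

#include <cstdint>
#include <cstdio>
#include <cmath>

// Decode an 8-bit VFP modified immediate back to the float it represents.
static float decodeVFPImm8(uint8_t Imm) {
  int Sign = (Imm >> 7) & 1;
  // Exponent = UInt(NOT(b):c:d) - 3, where b:c:d are bits 6-4 of Imm.
  int Exp = (((((Imm >> 6) & 1) ^ 1) << 2) | ((Imm >> 4) & 3)) - 3;
  float Mant = 1.0f + (Imm & 0xF) / 16.0f;
  return (Sign ? -1.0f : 1.0f) * std::ldexp(Mant, Exp);
}

int main() {
  // 1.0f (0x3F800000) has sign 0, a zero mantissa field and exponent 0, which
  // the encoder above should map to 0x70; immediate 0x00 decodes to 2.0.
  std::printf("0x70 -> %g\n", decodeVFPImm8(0x70)); // 1
  std::printf("0x00 -> %g\n", decodeVFPImm8(0x00)); // 2
  return 0;
}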
+// +//===----------------------------------------------------------------------===// + +#ifndef ARMISELLOWERING_H +#define ARMISELLOWERING_H + +#include "ARMSubtarget.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include <vector> + +namespace llvm { + class ARMConstantPoolValue; + + namespace ARMISD { + // ARM Specific DAG Nodes + enum NodeType { + // Start the numbering where the builtin ops and target ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + Wrapper, // Wrapper - A wrapper node for TargetConstantPool, + // TargetExternalSymbol, and TargetGlobalAddress. + WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable + + CALL, // Function call. + CALL_PRED, // Function call that's predicable. + CALL_NOLINK, // Function call with branch not branch-and-link. + tCALL, // Thumb function call. + BRCOND, // Conditional branch. + BR_JT, // Jumptable branch. + BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). + RET_FLAG, // Return with a flag operand. + + PIC_ADD, // Add with a PC operand and a PIC label. + + CMP, // ARM compare instructions. + CMPZ, // ARM compare that sets only Z flag. + CMPFP, // ARM VFP compare instruction, sets FPSCR. + CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. + FMSTAT, // ARM fmstat instruction. + CMOV, // ARM conditional move instructions. + CNEG, // ARM conditional negate instructions. + + RBIT, // ARM bitreverse instruction + + FTOSI, // FP to sint within a FP register. + FTOUI, // FP to uint within a FP register. + SITOF, // sint to FP within a FP register. + UITOF, // uint to FP within a FP register. + + SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. + SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. + RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. + + VMOVRRD, // double to two gprs. + VMOVDRR, // Two gprs to double. + + EH_SJLJ_SETJMP, // SjLj exception handling setjmp. + EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + + THREAD_POINTER, + + DYN_ALLOC, // Dynamic allocation on the stack. + + MEMBARRIER, // Memory barrier + SYNCBARRIER, // Memory sync barrier + + VCEQ, // Vector compare equal. + VCGE, // Vector compare greater than or equal. + VCGEU, // Vector compare unsigned greater than or equal. + VCGT, // Vector compare greater than. + VCGTU, // Vector compare unsigned greater than. + VTST, // Vector test bits. + + // Vector shift by immediate: + VSHL, // ...left + VSHRs, // ...right (signed) + VSHRu, // ...right (unsigned) + VSHLLs, // ...left long (signed) + VSHLLu, // ...left long (unsigned) + VSHLLi, // ...left long (with maximum shift count) + VSHRN, // ...right narrow + + // Vector rounding shift by immediate: + VRSHRs, // ...right (signed) + VRSHRu, // ...right (unsigned) + VRSHRN, // ...right narrow + + // Vector saturating shift by immediate: + VQSHLs, // ...left (signed) + VQSHLu, // ...left (unsigned) + VQSHLsu, // ...left (signed to unsigned) + VQSHRNs, // ...right narrow (signed) + VQSHRNu, // ...right narrow (unsigned) + VQSHRNsu, // ...right narrow (signed to unsigned) + + // Vector saturating rounding shift by immediate: + VQRSHRNs, // ...right narrow (signed) + VQRSHRNu, // ...right narrow (unsigned) + VQRSHRNsu, // ...right narrow (signed to unsigned) + + // Vector shift and insert: + VSLI, // ...left + VSRI, // ...right + + // Vector get lane (VMOV scalar to ARM core register) + // (These are used for 8- and 16-bit element types only.) 
+ VGETLANEu, // zero-extend vector extract element + VGETLANEs, // sign-extend vector extract element + + // Vector duplicate: + VDUP, + VDUPLANE, + + // Vector shuffles: + VEXT, // extract + VREV64, // reverse elements within 64-bit doublewords + VREV32, // reverse elements within 32-bit words + VREV16, // reverse elements within 16-bit halfwords + VZIP, // zip (interleave) + VUZP, // unzip (deinterleave) + VTRN, // transpose + + // Floating-point max and min: + FMAX, + FMIN + }; + } + + /// Define some predicates that are used for node matching. + namespace ARM { + /// getVMOVImm - If this is a build_vector of constants which can be + /// formed by using a VMOV instruction of the specified element size, + /// return the constant being splatted. The ByteSize field indicates the + /// number of bytes of each element [1248]. + SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG); + + /// getVFPf32Imm / getVFPf64Imm - If the given fp immediate can be + /// materialized with a VMOV.f32 / VMOV.f64 (i.e. fconsts / fconstd) + /// instruction, returns its 8-bit integer representation. Otherwise, + /// returns -1. + int getVFPf32Imm(const APFloat &FPImm); + int getVFPf64Imm(const APFloat &FPImm); + } + + //===--------------------------------------------------------------------===// + // ARMTargetLowering - ARM Implementation of the TargetLowering interface + + class ARMTargetLowering : public TargetLowering { + public: + explicit ARMTargetLowering(TargetMachine &TM); + + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + + /// ReplaceNodeResults - Replace the results of node with an illegal result + /// type with new values built out of custom code. + /// + virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const; + + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + virtual const char *getTargetNodeName(unsigned Opcode) const; + + virtual MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// unaligned memory accesses. of the specified type. + /// FIXME: Add getOptimalMemOpType to implement memcpy with NEON? + virtual bool allowsUnalignedMemoryAccesses(EVT VT) const; + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. + virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const; + bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; + + /// isLegalICmpImmediate - Return true if the specified immediate is legal + /// icmp immediate, that is the target has icmp instructions which can compare + /// a register against the immediate without having to materialize the + /// immediate into a register. + virtual bool isLegalICmpImmediate(int64_t Imm) const; + + /// getPreIndexedAddressParts - returns true by value, base pointer and + /// offset pointer and addressing mode by reference if the node's address + /// can be legally represented as pre-indexed load / store address. + virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const; + + /// getPostIndexedAddressParts - returns true by value, base pointer and + /// offset pointer and addressing mode by reference if this node can be + /// combined with a load / store to form a post-indexed load / store. 
+ virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, + SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const; + + virtual void computeMaskedBitsForTargetNode(const SDValue Op, + const APInt &Mask, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const; + + + ConstraintType getConstraintType(const std::string &Constraint) const; + std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const; + std::vector<unsigned> + getRegClassForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const; + + /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops + /// vector. If it is invalid, don't add anything to Ops. If hasMemory is + /// true it means one of the asm constraint of the inline asm instruction + /// being processed is 'm'. + virtual void LowerAsmOperandForConstraint(SDValue Op, + char ConstraintLetter, + bool hasMemory, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const; + + const ARMSubtarget* getSubtarget() const { + return Subtarget; + } + + /// getRegClassFor - Return the register class that should be used for the + /// specified value type. + virtual TargetRegisterClass *getRegClassFor(EVT VT) const; + + /// getFunctionAlignment - Return the Log2 alignment of this function. + virtual unsigned getFunctionAlignment(const Function *F) const; + + Sched::Preference getSchedulingPreference(SDNode *N) const; + + bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; + + /// isFPImmLegal - Returns true if the target can instruction select the + /// specified FP immediate natively. If false, the legalizer will + /// materialize the FP immediate as a load from a constant pool. + virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + + private: + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const ARMSubtarget *Subtarget; + + /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created. 
+ /// + unsigned ARMPCLabelIndex; + + void addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT); + void addDRTypeForNEON(EVT VT); + void addQRTypeForNEON(EVT VT); + + typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector; + void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, + SDValue Chain, SDValue &Arg, + RegsToPassVector &RegsToPass, + CCValAssign &VA, CCValAssign &NextVA, + SDValue &StackPtr, + SmallVector<SDValue, 8> &MemOpChains, + ISD::ArgFlagsTy Flags) const; + SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, + SDValue &Root, SelectionDAG &DAG, + DebugLoc dl) const; + + CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const; + SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const; + SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG) const; + SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, + SelectionDAG &DAG) const; + SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + DebugLoc dl, SelectionDAG &DAG) const; + + SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) const; + + MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size) const; + MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, + unsigned BinOpcode) const; + + }; +} + +#endif // ARMISELLOWERING_H diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td 
b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td new file mode 100644 index 0000000..d487df1 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -0,0 +1,1686 @@ +//===- ARMInstrFormats.td - ARM Instruction Formats --*- tablegen -*---------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// ARM Instruction Format Definitions. +// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format<bits<6> val> { + bits<6> Value = val; +} + +def Pseudo : Format<0>; +def MulFrm : Format<1>; +def BrFrm : Format<2>; +def BrMiscFrm : Format<3>; + +def DPFrm : Format<4>; +def DPSoRegFrm : Format<5>; + +def LdFrm : Format<6>; +def StFrm : Format<7>; +def LdMiscFrm : Format<8>; +def StMiscFrm : Format<9>; +def LdStMulFrm : Format<10>; + +def LdStExFrm : Format<11>; + +def ArithMiscFrm : Format<12>; +def ExtFrm : Format<13>; + +def VFPUnaryFrm : Format<14>; +def VFPBinaryFrm : Format<15>; +def VFPConv1Frm : Format<16>; +def VFPConv2Frm : Format<17>; +def VFPConv3Frm : Format<18>; +def VFPConv4Frm : Format<19>; +def VFPConv5Frm : Format<20>; +def VFPLdStFrm : Format<21>; +def VFPLdStMulFrm : Format<22>; +def VFPMiscFrm : Format<23>; + +def ThumbFrm : Format<24>; + +def NEONFrm : Format<25>; +def NEONGetLnFrm : Format<26>; +def NEONSetLnFrm : Format<27>; +def NEONDupFrm : Format<28>; + +def MiscFrm : Format<29>; +def ThumbMiscFrm : Format<30>; + +def NLdStFrm : Format<31>; +def N1RegModImmFrm : Format<32>; +def N2RegFrm : Format<33>; +def NVCVTFrm : Format<34>; +def NVDupLnFrm : Format<35>; +def N2RegVShLFrm : Format<36>; +def N2RegVShRFrm : Format<37>; +def N3RegFrm : Format<38>; +def N3RegVShFrm : Format<39>; +def NVExtFrm : Format<40>; +def NVMulSLFrm : Format<41>; +def NVTBLFrm : Format<42>; + +// Misc flags. + +// the instruction has a Rn register operand. +// UnaryDP - Indicates this is a unary data processing instruction, i.e. +// it doesn't have a Rn operand. +class UnaryDP { bit isUnaryDataProc = 1; } + +// Xform16Bit - Indicates this Thumb2 instruction may be transformed into +// a 16-bit Thumb instruction if certain conditions are met. +class Xform16Bit { bit canXformTo16Bit = 1; } + +//===----------------------------------------------------------------------===// +// ARM Instruction flags. These need to match ARMBaseInstrInfo.h. +// + +// Addressing mode. +class AddrMode<bits<4> val> { + bits<4> Value = val; +} +def AddrModeNone : AddrMode<0>; +def AddrMode1 : AddrMode<1>; +def AddrMode2 : AddrMode<2>; +def AddrMode3 : AddrMode<3>; +def AddrMode4 : AddrMode<4>; +def AddrMode5 : AddrMode<5>; +def AddrMode6 : AddrMode<6>; +def AddrModeT1_1 : AddrMode<7>; +def AddrModeT1_2 : AddrMode<8>; +def AddrModeT1_4 : AddrMode<9>; +def AddrModeT1_s : AddrMode<10>; +def AddrModeT2_i12: AddrMode<11>; +def AddrModeT2_i8 : AddrMode<12>; +def AddrModeT2_so : AddrMode<13>; +def AddrModeT2_pc : AddrMode<14>; +def AddrModeT2_i8s4 : AddrMode<15>; + +// Instruction size. +class SizeFlagVal<bits<3> val> { + bits<3> Value = val; +} +def SizeInvalid : SizeFlagVal<0>; // Unset. +def SizeSpecial : SizeFlagVal<1>; // Pseudo or special. 
+def Size8Bytes : SizeFlagVal<2>; +def Size4Bytes : SizeFlagVal<3>; +def Size2Bytes : SizeFlagVal<4>; + +// Load / store index mode. +class IndexMode<bits<2> val> { + bits<2> Value = val; +} +def IndexModeNone : IndexMode<0>; +def IndexModePre : IndexMode<1>; +def IndexModePost : IndexMode<2>; +def IndexModeUpd : IndexMode<3>; + +// Instruction execution domain. +class Domain<bits<2> val> { + bits<2> Value = val; +} +def GenericDomain : Domain<0>; +def VFPDomain : Domain<1>; // Instructions in VFP domain only +def NeonDomain : Domain<2>; // Instructions in Neon domain only +def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains + +//===----------------------------------------------------------------------===// + +// ARM special operands. +// + +// ARM Predicate operand. Default to 14 = always (AL). Second part is CC +// register whose default is 0 (no register). +def pred : PredicateOperand<OtherVT, (ops i32imm, CCR), + (ops (i32 14), (i32 zero_reg))> { + let PrintMethod = "printPredicateOperand"; +} + +// Conditional code result for instructions whose 's' bit is set, e.g. subs. +def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> { + let PrintMethod = "printSBitModifierOperand"; +} + +// Same as cc_out except it defaults to setting CPSR. +def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> { + let PrintMethod = "printSBitModifierOperand"; +} + +// ARM special operands for disassembly only. +// + +def cps_opt : Operand<i32> { + let PrintMethod = "printCPSOptionOperand"; +} + +def msr_mask : Operand<i32> { + let PrintMethod = "printMSRMaskOperand"; +} + +// A8.6.117, A8.6.118. Different instructions are generated for #0 and #-0. +// The neg_zero operand translates -0 to -1, -1 to -2, ..., etc. +def neg_zero : Operand<i32> { + let PrintMethod = "printNegZeroOperand"; +} + +//===----------------------------------------------------------------------===// + +// ARM Instruction templates. +// + +class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im, + Format f, Domain d, string cstr, InstrItinClass itin> + : Instruction { + let Namespace = "ARM"; + + AddrMode AM = am; + SizeFlagVal SZ = sz; + IndexMode IM = im; + bits<2> IndexModeBits = IM.Value; + Format F = f; + bits<6> Form = F.Value; + Domain D = d; + bit isUnaryDataProc = 0; + bit canXformTo16Bit = 0; + + // The layout of TSFlags should be kept in sync with ARMBaseInstrInfo.h. + let TSFlags{3-0} = AM.Value; + let TSFlags{6-4} = SZ.Value; + let TSFlags{8-7} = IndexModeBits; + let TSFlags{14-9} = Form; + let TSFlags{15} = isUnaryDataProc; + let TSFlags{16} = canXformTo16Bit; + let TSFlags{18-17} = D.Value; + + let Constraints = cstr; + let Itinerary = itin; +} + +class Encoding { + field bits<32> Inst; +} + +class InstARM<AddrMode am, SizeFlagVal sz, IndexMode im, + Format f, Domain d, string cstr, InstrItinClass itin> + : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding; + +// This Encoding-less class is used by Thumb1 to specify the encoding bits later +// on by adding flavors to specific instructions. 
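The TSFlags layout set up in InstTemplate above is the contract with the C++ side of the backend; as the comment notes, it must stay in sync with ARMBaseInstrInfo.h, which reads the same bit ranges. A hedged sketch of how such packed flags are unpacked (hypothetical accessor names, not the real ARMII definitions):

#include <cstdint>

struct ARMTSFlagsView {
  uint64_t TSFlags;
  unsigned addrMode()     const { return (TSFlags >> 0)  & 0xF;  } // bits 3-0
  unsigned sizeFlag()     const { return (TSFlags >> 4)  & 0x7;  } // bits 6-4
  unsigned indexMode()    const { return (TSFlags >> 7)  & 0x3;  } // bits 8-7
  unsigned format()       const { return (TSFlags >> 9)  & 0x3F; } // bits 14-9
  bool     isUnaryDP()    const { return (TSFlags >> 15) & 0x1;  } // bit 15
  bool     canXformTo16() const { return (TSFlags >> 16) & 0x1;  } // bit 16
  unsigned domain()       const { return (TSFlags >> 17) & 0x3;  } // bits 18-17
};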
+class InstThumb<AddrMode am, SizeFlagVal sz, IndexMode im, + Format f, Domain d, string cstr, InstrItinClass itin> + : InstTemplate<am, sz, im, f, d, cstr, itin>; + +class PseudoInst<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, GenericDomain, + "", itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; +} + +// Almost all ARM instructions are predicable. +class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} +// A few are not predicable +class InoP<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = !strconcat(opc, asm); + let Pattern = pattern; + let isPredicable = 0; + list<Predicate> Predicates = [IsARM]; +} + +// Same as I except it can optionally modify CPSR. Note it's modeled as +// an input operand since by default it's a zero register. It will +// become an implicit def once it's "flipped". +class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p, cc_out:$s)); + let AsmString = !strconcat(opc, !strconcat("${p}${s}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +// Special cases +class XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +class AI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; +class AsI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; +class AXI<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern>; +class AInoP<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : InoP<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; + +// Ctrl flow instructions +class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, itin, + opc, asm, "", pattern> { + let Inst{27-24} = opcod; +} +class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + 
: XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, itin, + asm, "", pattern> { + let Inst{27-24} = opcod; +} +class ABXIx2<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, BrMiscFrm, itin, + asm, "", pattern>; + +// BR_JT instructions +class JTI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm, itin, + asm, "", pattern>; + + +// Atomic load/store instructions + +class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin, + opc, asm, "", pattern> { + let Inst{27-23} = 0b00011; + let Inst{22-21} = opcod; + let Inst{20} = 1; + let Inst{11-0} = 0b111110011111; +} +class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin, + opc, asm, "", pattern> { + let Inst{27-23} = 0b00011; + let Inst{22-21} = opcod; + let Inst{20} = 0; + let Inst{11-4} = 0b11111001; +} + +// addrmode1 instructions +class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{24-21} = opcod; + let Inst{27-26} = {0,0}; +} +class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{24-21} = opcod; + let Inst{27-26} = {0,0}; +} +class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{24-21} = opcod; + let Inst{27-26} = {0,0}; +} +class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; + + +// addrmode2 loads and stores +class AI2<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{27-26} = {0,1}; +} + +// loads +class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 0; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} +class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 0; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} +class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 1; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} 
+class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 1; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} + +// stores +class AI2stw<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 0; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} +class AXI2stw<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 0; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} +class AI2stb<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 1; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} +class AXI2stb<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 1; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} + +// Pre-indexed loads +class AI2ldwpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{20} = 1; // L bit + let Inst{21} = 1; // W bit + let Inst{22} = 0; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} +class AI2ldbpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{20} = 1; // L bit + let Inst{21} = 1; // W bit + let Inst{22} = 1; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} + +// Pre-indexed stores +class AI2stwpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{20} = 0; // L bit + let Inst{21} = 1; // W bit + let Inst{22} = 0; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} +class AI2stbpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{20} = 0; // L bit + let Inst{21} = 1; // W bit + let Inst{22} = 1; // B bit + let Inst{24} = 1; // P bit + let Inst{27-26} = {0,1}; +} + +// Post-indexed loads +class AI2ldwpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 0; // B bit + let 
Inst{24} = 0; // P bit + let Inst{27-26} = {0,1}; +} +class AI2ldbpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 1; // B bit + let Inst{24} = 0; // P bit + let Inst{27-26} = {0,1}; +} + +// Post-indexed stores +class AI2stwpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 0; // B bit + let Inst{24} = 0; // P bit + let Inst{27-26} = {0,1}; +} +class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 1; // B bit + let Inst{24} = 0; // P bit + let Inst{27-26} = {0,1}; +} + +// addrmode3 instructions +class AI3<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; +class AXI3<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern>; + +// loads +class AI3ldh<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 0; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} +class AXI3ldh<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 0; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 1; // P bit +} +class AI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} +class AXI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 1; // P bit +} +class AI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{4} = 1; + let Inst{5} = 0; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 1; // P bit + let 
Inst{27-25} = 0b000; +} +class AXI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{4} = 1; + let Inst{5} = 0; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 1; // P bit +} +class AI3ldd<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{4} = 1; + let Inst{5} = 0; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} + +// stores +class AI3sth<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 0; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} +class AXI3sth<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 0; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 1; // P bit +} +class AI3std<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} + +// Pre-indexed loads +class AI3ldhpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 0; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 1; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} +class AI3ldshpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 1; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} +class AI3ldsbpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 0; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 1; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} +class AI3lddpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + 
let Inst{4} = 1; + let Inst{5} = 0; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 1; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} + + +// Pre-indexed stores +class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 0; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 1; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} +class AI3stdpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 1; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} + +// Post-indexed loads +class AI3ldhpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 0; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; +} +class AI3ldshpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; +} +class AI3ldsbpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { + let Inst{4} = 1; + let Inst{5} = 0; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; +} +class AI3lddpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 0; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; +} + +// Post-indexed stores +class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 0; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; +} +class AI3stdpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H 
bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit + let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; +} + +// addrmode4 instructions +class AXI4ld<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : XI<oops, iops, AddrMode4, Size4Bytes, im, f, itin, + asm, cstr, pattern> { + let Inst{20} = 1; // L bit + let Inst{22} = 0; // S bit + let Inst{27-25} = 0b100; +} +class AXI4st<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : XI<oops, iops, AddrMode4, Size4Bytes, im, f, itin, + asm, cstr, pattern> { + let Inst{20} = 0; // L bit + let Inst{22} = 0; // S bit + let Inst{27-25} = 0b100; +} + +// Unsigned multiply, multiply-accumulate instructions. +class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { + let Inst{7-4} = 0b1001; + let Inst{20} = 0; // S bit + let Inst{27-21} = opcod; +} +class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { + let Inst{7-4} = 0b1001; + let Inst{27-21} = opcod; +} + +// Most significant word multiply +class AMul2I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { + let Inst{7-4} = 0b1001; + let Inst{20} = 1; + let Inst{27-21} = opcod; +} + +// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y> +class AMulxyI<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { + let Inst{4} = 0; + let Inst{7} = 1; + let Inst{20} = 0; + let Inst{27-21} = opcod; +} + +// Extend instructions. +class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, itin, + opc, asm, "", pattern> { + let Inst{7-4} = 0b0111; + let Inst{27-20} = opcod; +} + +// Misc Arithmetic instructions. +class AMiscA1I<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, itin, + opc, asm, "", pattern> { + let Inst{27-20} = opcod; +} + +//===----------------------------------------------------------------------===// + +// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode. +class ARMPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM]; +} +class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV5TE]; +} +class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV6]; +} + +//===----------------------------------------------------------------------===// +// +// Thumb Instruction Format Definitions. +// + +// TI - Thumb instruction. 
+ +class ThumbI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb]; +} + +class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>; + +// Two-address instructions +class TIt<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", + pattern>; + +// tBL, tBX 32-bit instructions +class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3, + dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>, + Encoding { + let Inst{31-27} = opcod1; + let Inst{15-14} = opcod2; + let Inst{12} = opcod3; +} + +// BR_JT instructions +class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; + +// Thumb1 only +class Thumb1I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb1Only]; +} + +class T1I<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>; +class T1Ix2<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>; +class T1JTI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; + +// Two-address instructions +class T1It<dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, + asm, cstr, pattern>; + +// Thumb1 instruction that can either be predicated or set CPSR. +class Thumb1sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = !con(oops, (outs s_cc_out:$s)); + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb1Only]; +} + +class T1sI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, "", pattern>; + +// Two-address instructions +class T1sIt<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, + "$lhs = $dst", pattern>; + +// Thumb1 instruction that can be predicated. 
+class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb1Only]; +} + +class T1pI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, "", pattern>; + +// Two-address instructions +class T1pIt<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, + "$lhs = $dst", pattern>; + +class T1pI1<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeT1_1, Size2Bytes, itin, opc, asm, "", pattern>; +class T1pI2<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeT1_2, Size2Bytes, itin, opc, asm, "", pattern>; +class T1pI4<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeT1_4, Size2Bytes, itin, opc, asm, "", pattern>; +class T1pIs<dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeT1_s, Size2Bytes, itin, opc, asm, "", pattern>; + +class Encoding16 : Encoding { + let Inst{31-16} = 0x0000; +} + +// A6.2 16-bit Thumb instruction encoding +class T1Encoding<bits<6> opcode> : Encoding16 { + let Inst{15-10} = opcode; +} + +// A6.2.1 Shift (immediate), add, subtract, move, and compare encoding. +class T1General<bits<5> opcode> : Encoding16 { + let Inst{15-14} = 0b00; + let Inst{13-9} = opcode; +} + +// A6.2.2 Data-processing encoding. +class T1DataProcessing<bits<4> opcode> : Encoding16 { + let Inst{15-10} = 0b010000; + let Inst{9-6} = opcode; +} + +// A6.2.3 Special data instructions and branch and exchange encoding. +class T1Special<bits<4> opcode> : Encoding16 { + let Inst{15-10} = 0b010001; + let Inst{9-6} = opcode; +} + +// A6.2.4 Load/store single data item encoding. +class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 { + let Inst{15-12} = opA; + let Inst{11-9} = opB; +} +class T1LdSt<bits<3> opB> : T1LoadStore<0b0101, opB>; +class T1LdSt4Imm<bits<3> opB> : T1LoadStore<0b0110, opB>; // Immediate, 4 bytes +class T1LdSt1Imm<bits<3> opB> : T1LoadStore<0b0111, opB>; // Immediate, 1 byte +class T1LdSt2Imm<bits<3> opB> : T1LoadStore<0b1000, opB>; // Immediate, 2 bytes +class T1LdStSP<bits<3> opB> : T1LoadStore<0b1001, opB>; // SP relative + +// A6.2.5 Miscellaneous 16-bit instructions encoding. +class T1Misc<bits<7> opcode> : Encoding16 { + let Inst{15-12} = 0b1011; + let Inst{11-5} = opcode; +} + +// Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable. +class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; +} + +// Same as Thumb2I except it can optionally modify CPSR. 
Note it's modeled as +// an input operand since by default it's a zero register. It will +// become an implicit def once it's "flipped". +// FIXME: This uses unified syntax so {s} comes before {p}. We should make it +// more consistent. +class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p, cc_out:$s)); + let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; +} + +// Special cases +class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; +} + +class ThumbXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb1Only]; +} + +class T2I<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>; +class T2Ii12<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "",pattern>; +class T2Ii8<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, itin, opc, asm, "", pattern>; +class T2Iso<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_so, Size4Bytes, itin, opc, asm, "", pattern>; +class T2Ipc<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_pc, Size4Bytes, itin, opc, asm, "", pattern>; +class T2Ii8s4<bit P, bit W, bit load, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8s4, Size4Bytes, itin, opc, asm, "", + pattern> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24} = P; + let Inst{23} = ?; // The U bit. 
+ let Inst{22} = 1; + let Inst{21} = W; + let Inst{20} = load; +} + +class T2sI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2sI<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>; + +class T2XI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>; +class T2JTI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; + +class T2Ix2<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeNone, Size8Bytes, itin, opc, asm, "", pattern>; + +// Two-address instructions +class T2XIt<dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, cstr, pattern>; + +// T2Iidxldst - Thumb2 indexed load / store instructions. +class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre, + dag oops, dag iops, + AddrMode am, IndexMode im, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, Size4Bytes, im, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = load; + let Inst{11} = 1; + // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed + let Inst{10} = pre; // The P bit. + let Inst{8} = 1; // The W bit. +} + +// Helper class for disassembly only +// A6.3.16 & A6.3.17 +// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions. +class T2I_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b011; + let Inst{23} = long; + let Inst{22-20} = op22_20; + let Inst{7-4} = op7_4; +} + +// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode. +class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb1Only, HasV5T]; +} + +// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode. +class T1Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb1Only]; +} + +// T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode. +class T2Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb2]; +} + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM VFP Instruction templates. +// + +// Almost all VFP instructions are predicable. 
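"Predicable" here means the base class appends a pred:$p operand to the input list and splices "${p}" into the assembly string, so one definition covers both the unconditional and the conditional forms: with the default AL predicate the substitution prints nothing, while with, say, NE a vadd prints as "vaddne". The VFPI class below is the predicable base; VFPXI handles the special cases that build their assembly string directly.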
+class VFPI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, VFPDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [HasVFP2]; +} + +// Special cases +class VFPXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, VFPDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [HasVFP2]; +} + +class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; + +// ARM VFP addrmode5 loads and stores +class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, + InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, + VFPLdStFrm, itin, opc, asm, "", pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. + let Inst{27-24} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1011; + + // 64-bit loads & stores operate on both NEON and VFP pipelines. + let D = VFPNeonDomain; +} + +class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, + InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, + VFPLdStFrm, itin, opc, asm, "", pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. + let Inst{27-24} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1010; +} + +// Load / store multiple +class AXDI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : VFPXI<oops, iops, AddrMode5, Size4Bytes, im, + VFPLdStMulFrm, itin, asm, cstr, pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. + let Inst{27-25} = 0b110; + let Inst{11-8} = 0b1011; + + // 64-bit loads & stores operate on both NEON and VFP pipelines. + let D = VFPNeonDomain; +} + +class AXSI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : VFPXI<oops, iops, AddrMode5, Size4Bytes, im, + VFPLdStMulFrm, itin, asm, cstr, pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. 
+ let Inst{27-25} = 0b110; + let Inst{11-8} = 0b1010; +} + +// Double precision, unary +class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, + string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-8} = 0b1011; + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; +} + +// Double precision, binary +class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1011; + let Inst{6} = op6; + let Inst{4} = op4; +} + +// Double precision, binary, VML[AS] (for additional predicate) +class ADbI_vmlX<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1011; + let Inst{6} = op6; + let Inst{4} = op4; + list<Predicate> Predicates = [HasVFP2, UseVMLx]; +} + + +// Single precision, unary +class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, + string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-8} = 0b1010; + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; +} + +// Single precision unary, if no NEON +// Same as ASuI except not available if NEON is enabled +class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, + string asm, list<dag> pattern> + : ASuI<opcod1, opcod2, opcod3, opcod4, opcod5, oops, iops, itin, opc, asm, + pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; +} + +// Single precision, binary +class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1010; + let Inst{6} = op6; + let Inst{4} = op4; +} + +// Single precision binary, if no NEON +// Same as ASbI except not available if NEON is enabled +class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; +} + +// VFP conversion instructions +class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-8} = opcod4; + let Inst{6} = 1; + let Inst{4} = 0; +} + +// VFP conversion between floating-point and fixed-point +class AVConv1XI<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, + dag oops, dag iops, InstrItinClass itin, string opc, 
string asm, + list<dag> pattern> + : AVConv1I<op1, op2, op3, op4, oops, iops, itin, opc, asm, pattern> { + // size (fixed-point number): sx == 0 ? 16 : 32 + let Inst{7} = op5; // sx +} + +// VFP conversion instructions, if no NEON +class AVConv1In<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; +} + +class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f, + InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, f, itin, opc, asm, pattern> { + let Inst{27-20} = opcod1; + let Inst{11-8} = opcod2; + let Inst{4} = 1; +} + +class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>; + +class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>; + +class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>; + +class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>; + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM NEON Instruction templates. +// + +class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, + InstrItinClass itin, string opc, string dt, string asm, string cstr, + list<dag> pattern> + : InstARM<am, Size4Bytes, im, f, NeonDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat( + !strconcat(!strconcat(opc, "${p}"), !strconcat(".", dt)), + !strconcat("\t", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; +} + +// Same as NeonI except it does not have a "data type" specifier. 
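Concretely, for an opc of "vadd", a dt of "i32", and an asm string of "$dst, $src1, $src2" (operand names illustrative), the NeonI concatenation above yields "vadd${p}.i32\t$dst, $src1, $src2", whereas the NeonXI class below produces just "vadd${p}\t$dst, $src1, $src2".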
+class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, Size4Bytes, im, f, NeonDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(!strconcat(opc, "${p}"), !strconcat("\t", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; +} + +class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NeonI<oops, iops, AddrMode6, IndexModeNone, NLdStFrm, itin, opc, dt, asm, + cstr, pattern> { + let Inst{31-24} = 0b11110100; + let Inst{23} = op23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{7-4} = op7_4; +} + +class NDataI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr, + pattern> { + let Inst{31-25} = 0b1111001; +} + +class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm, + cstr, pattern> { + let Inst{31-25} = 0b1111001; +} + +// NEON "one register and a modified immediate" format. +class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, + bit op5, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> { + let Inst{23} = op23; + let Inst{21-19} = op21_19; + let Inst{11-8} = op11_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{5} = op5; + let Inst{4} = op4; +} + +// NEON 2 vector register format. +class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, N2RegFrm, itin, opc, dt, asm, cstr, pattern> { + let Inst{24-23} = op24_23; + let Inst{21-20} = op21_20; + let Inst{19-18} = op19_18; + let Inst{17-16} = op17_16; + let Inst{11-7} = op11_7; + let Inst{6} = op6; + let Inst{4} = op4; +} + +// Same as N2V except it doesn't have a datatype suffix. +class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, N2RegFrm, itin, opc, asm, cstr, pattern> { + let Inst{24-23} = op24_23; + let Inst{21-20} = op21_20; + let Inst{19-18} = op19_18; + let Inst{17-16} = op17_16; + let Inst{11-7} = op11_7; + let Inst{6} = op6; + let Inst{4} = op4; +} + +// NEON 2 vector register with immediate. +class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{11-8} = op11_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{4} = op4; +} + +// NEON 3 vector register format. 
+class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; +} + +// Same as N3V except it doesn't have a data type suffix. +class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; +} + +// NEON VMOVs between scalar and core registers. +class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, f, GenericDomain, + "", itin> { + let Inst{27-20} = opcod1; + let Inst{11-8} = opcod2; + let Inst{6-5} = opcod3; + let Inst{4} = 1; + + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat( + !strconcat(!strconcat(opc, "${p}"), !strconcat(".", dt)), + !strconcat("\t", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; +} +class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, itin, + opc, dt, asm, pattern>; +class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, itin, + opc, dt, asm, pattern>; +class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, itin, + opc, dt, asm, pattern>; + +// Vector Duplicate Lane (from scalar to all elements) +class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops, + InstrItinClass itin, string opc, string dt, string asm, + list<dag> pattern> + : NDataI<oops, iops, NVDupLnFrm, itin, opc, dt, asm, "", pattern> { + let Inst{24-23} = 0b11; + let Inst{21-20} = 0b11; + let Inst{19-16} = op19_16; + let Inst{11-7} = 0b11000; + let Inst{6} = op6; + let Inst{4} = 0; +} + +// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON +// for single-precision FP. +class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [HasNEON,UseNEONForFP]; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp new file mode 100644 index 0000000..85f6b40 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -0,0 +1,86 @@ +//===- ARMInstrInfo.cpp - ARM Instruction Information -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARMInstrInfo.h" +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMGenInstrInfo.inc" +#include "ARMMachineFunctionInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/MC/MCAsmInfo.h" +using namespace llvm; + +ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI), RI(*this, STI) { +} + +unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { + switch (Opc) { + default: break; + case ARM::LDR_PRE: + case ARM::LDR_POST: + return ARM::LDR; + case ARM::LDRH_PRE: + case ARM::LDRH_POST: + return ARM::LDRH; + case ARM::LDRB_PRE: + case ARM::LDRB_POST: + return ARM::LDRB; + case ARM::LDRSH_PRE: + case ARM::LDRSH_POST: + return ARM::LDRSH; + case ARM::LDRSB_PRE: + case ARM::LDRSB_POST: + return ARM::LDRSB; + case ARM::STR_PRE: + case ARM::STR_POST: + return ARM::STR; + case ARM::STRH_PRE: + case ARM::STRH_POST: + return ARM::STRH; + case ARM::STRB_PRE: + case ARM::STRB_POST: + return ARM::STRB; + } + + return 0; +} + +void ARMInstrInfo:: +reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, + const TargetRegisterInfo *TRI) const { + DebugLoc dl = Orig->getDebugLoc(); + unsigned Opcode = Orig->getOpcode(); + switch (Opcode) { + default: + break; + case ARM::MOVi2pieces: { + RI.emitLoadConstPool(MBB, I, dl, + DestReg, SubIdx, + Orig->getOperand(1).getImm(), + (ARMCC::CondCodes)Orig->getOperand(2).getImm(), + Orig->getOperand(3).getReg()); + MachineInstr *NewMI = prior(I); + NewMI->getOperand(0).setSubReg(SubIdx); + return; + } + } + + return ARMBaseInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, TRI); +} + diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h new file mode 100644 index 0000000..d4199d1 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -0,0 +1,49 @@ +//===- ARMInstrInfo.h - ARM Instruction Information -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMINSTRUCTIONINFO_H +#define ARMINSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "ARMBaseInstrInfo.h" +#include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" +#include "ARM.h" + +namespace llvm { + class ARMSubtarget; + +class ARMInstrInfo : public ARMBaseInstrInfo { + ARMRegisterInfo RI; +public: + explicit ARMInstrInfo(const ARMSubtarget &STI); + + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. 
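Per the switch in ARMInstrInfo.cpp above, getUnindexedOpcode(ARM::LDR_PRE) and getUnindexedOpcode(ARM::LDR_POST) both return ARM::LDR, while an opcode with no unindexed counterpart falls through the default case and yields 0.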
+ unsigned getUnindexedOpcode(unsigned Opc) const; + + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo *TRI) const; + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const ARMRegisterInfo &getRegisterInfo() const { return RI; } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td new file mode 100644 index 0000000..f3156d9 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -0,0 +1,2938 @@ +//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM specific DAG Nodes. +// + +// Type profiles. +def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; + +def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>; + +def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; + +def SDT_ARMCMov : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisVT<3, i32>]>; + +def SDT_ARMBrcond : SDTypeProfile<0, 2, + [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>; + +def SDT_ARMBrJT : SDTypeProfile<0, 3, + [SDTCisPtrTy<0>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; + +def SDT_ARMBr2JT : SDTypeProfile<0, 4, + [SDTCisPtrTy<0>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; + +def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; + +def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; + +def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; +def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, + SDTCisInt<2>]>; +def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; + +def SDT_ARMMEMBARRIERV7 : SDTypeProfile<0, 0, []>; +def SDT_ARMSYNCBARRIERV7 : SDTypeProfile<0, 0, []>; +def SDT_ARMMEMBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDT_ARMSYNCBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +// Node definitions. 
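Each def below binds an ARMISD (or generic ISD) node name to its type profile and DAG properties; for instance ARMcmp carries SDNPOutFlag because it produces only a flags result, which ARMcmov then consumes via SDNPInFlag.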
+def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; +def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>; + +def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart, + [SDNPHasChain, SDNPOutFlag]>; +def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; + +def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, + SDNPVariadic]>; +def ARMcall_pred : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, + SDNPVariadic]>; +def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, + SDNPVariadic]>; + +def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInFlag]>; + +def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, + [SDNPInFlag]>; +def ARMcneg : SDNode<"ARMISD::CNEG", SDT_ARMCMov, + [SDNPInFlag]>; + +def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, + [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>; + +def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT, + [SDNPHasChain]>; +def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT, + [SDNPHasChain]>; + +def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp, + [SDNPOutFlag]>; + +def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp, + [SDNPOutFlag,SDNPCommutative]>; + +def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>; + +def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>; +def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>; +def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInFlag ]>; + +def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>; +def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", + SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>; +def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", + SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>; + +def ARMMemBarrierV7 : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERV7, + [SDNPHasChain]>; +def ARMSyncBarrierV7 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV7, + [SDNPHasChain]>; +def ARMMemBarrierV6 : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERV6, + [SDNPHasChain]>; +def ARMSyncBarrierV6 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV6, + [SDNPHasChain]>; + +def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; + +//===----------------------------------------------------------------------===// +// ARM Instruction Predicate Definitions. 
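These predicates are referenced from instruction definitions through their Requires<[...]> lists (for example Requires<[IsARM, HasV6]> further down in this file) and are evaluated against the current subtarget, which is how a single .td description covers every architecture variant.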
+// +def HasV4T : Predicate<"Subtarget->hasV4TOps()">; +def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; +def HasV5T : Predicate<"Subtarget->hasV5TOps()">; +def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">; +def HasV6 : Predicate<"Subtarget->hasV6Ops()">; +def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">; +def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; +def HasV7 : Predicate<"Subtarget->hasV7Ops()">; +def NoVFP : Predicate<"!Subtarget->hasVFP2()">; +def HasVFP2 : Predicate<"Subtarget->hasVFP2()">; +def HasVFP3 : Predicate<"Subtarget->hasVFP3()">; +def HasNEON : Predicate<"Subtarget->hasNEON()">; +def HasDivide : Predicate<"Subtarget->hasDivide()">; +def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">; +def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; +def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; +def IsThumb : Predicate<"Subtarget->isThumb()">; +def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; +def IsThumb2 : Predicate<"Subtarget->isThumb2()">; +def IsARM : Predicate<"!Subtarget->isThumb()">; +def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; +def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; + +// FIXME: Eventually this will be just "hasV6T2Ops". +def UseMovt : Predicate<"Subtarget->useMovt()">; +def DontUseMovt : Predicate<"!Subtarget->useMovt()">; + +def UseVMLx : Predicate<"Subtarget->useVMLx()">; + +//===----------------------------------------------------------------------===// +// ARM Flag Definitions. + +class RegConstraint<string C> { + string Constraints = C; +} + +//===----------------------------------------------------------------------===// +// ARM specific transformation functions and pattern fragments. +// + +// so_imm_neg_XFORM - Return a so_imm value packed into the format described for +// so_imm_neg def below. +def so_imm_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32); +}]>; + +// so_imm_not_XFORM - Return a so_imm value packed into the format described for +// so_imm_not def below. +def so_imm_not_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32); +}]>; + +// rot_imm predicate - True if the 32-bit immediate is equal to 8, 16, or 24. +def rot_imm : PatLeaf<(i32 imm), [{ + int32_t v = (int32_t)N->getZExtValue(); + return v == 8 || v == 16 || v == 24; +}]>; + +/// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15]. +def imm1_15 : PatLeaf<(i32 imm), [{ + return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 16; +}]>; + +/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31]. +def imm16_31 : PatLeaf<(i32 imm), [{ + return (int32_t)N->getZExtValue() >= 16 && (int32_t)N->getZExtValue() < 32; +}]>; + +def so_imm_neg : + PatLeaf<(imm), [{ + return ARM_AM::getSOImmVal(-(int)N->getZExtValue()) != -1; + }], so_imm_neg_XFORM>; + +def so_imm_not : + PatLeaf<(imm), [{ + return ARM_AM::getSOImmVal(~(int)N->getZExtValue()) != -1; + }], so_imm_not_XFORM>; + +// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits. 
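In other words, at least the top 17 bits are copies of bit 15, so the value fits in a signed 16-bit range; that is what the ComputeNumSignBits(...) >= 17 check below asserts. For a concrete constant this amounts to the following standalone check (name hypothetical, shown only for illustration):

  // True when sign-extending the low half of V reproduces V itself.
  static inline bool isSignExtendedFrom16(int32_t V) {
    return V == (int32_t)(int16_t)V;
  }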
+def sext_16_node : PatLeaf<(i32 GPR:$a), [{ + return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17; +}]>; + +/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield +/// e.g., 0xf000ffff +def bf_inv_mask_imm : Operand<i32>, + PatLeaf<(imm), [{ + uint32_t v = (uint32_t)N->getZExtValue(); + if (v == 0xffffffff) + return 0; + // there can be 1's on either or both "outsides", all the "inside" + // bits must be 0's + unsigned int lsb = 0, msb = 31; + while (v & (1 << msb)) --msb; + while (v & (1 << lsb)) ++lsb; + for (unsigned int i = lsb; i <= msb; ++i) { + if (v & (1 << i)) + return 0; + } + return 1; +}] > { + let PrintMethod = "printBitfieldInvMaskImmOperand"; +} + +/// Split a 32-bit immediate into two 16 bit parts. +def lo16 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() & 0xffff, + MVT::i32); +}]>; + +def hi16 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32); +}]>; + +def lo16AllZero : PatLeaf<(i32 imm), [{ + // Returns true if all low 16-bits are 0. + return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0; +}], hi16>; + +/// imm0_65535 predicate - True if the 32-bit immediate is in the range +/// [0.65535]. +def imm0_65535 : PatLeaf<(i32 imm), [{ + return (uint32_t)N->getZExtValue() < 65536; +}]>; + +class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; +class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>; + +/// adde and sube predicates - True based on whether the carry flag output +/// will be needed or not. +def adde_dead_carry : + PatFrag<(ops node:$LHS, node:$RHS), (adde node:$LHS, node:$RHS), + [{return !N->hasAnyUseOfValue(1);}]>; +def sube_dead_carry : + PatFrag<(ops node:$LHS, node:$RHS), (sube node:$LHS, node:$RHS), + [{return !N->hasAnyUseOfValue(1);}]>; +def adde_live_carry : + PatFrag<(ops node:$LHS, node:$RHS), (adde node:$LHS, node:$RHS), + [{return N->hasAnyUseOfValue(1);}]>; +def sube_live_carry : + PatFrag<(ops node:$LHS, node:$RHS), (sube node:$LHS, node:$RHS), + [{return N->hasAnyUseOfValue(1);}]>; + +//===----------------------------------------------------------------------===// +// Operand Definitions. +// + +// Branch target. +def brtarget : Operand<OtherVT>; + +// A list of registers separated by comma. Used by load/store multiple. +def reglist : Operand<i32> { + let PrintMethod = "printRegisterList"; +} + +// An operand for the CONSTPOOL_ENTRY pseudo-instruction. +def cpinst_operand : Operand<i32> { + let PrintMethod = "printCPInstOperand"; +} + +def jtblock_operand : Operand<i32> { + let PrintMethod = "printJTBlockOperand"; +} +def jt2block_operand : Operand<i32> { + let PrintMethod = "printJT2BlockOperand"; +} + +// Local PC labels. +def pclabel : Operand<i32> { + let PrintMethod = "printPCLabel"; +} + +// shifter_operand operands: so_reg and so_imm. +def so_reg : Operand<i32>, // reg reg imm + ComplexPattern<i32, 3, "SelectShifterOperandReg", + [shl,srl,sra,rotr]> { + let PrintMethod = "printSORegOperand"; + let MIOperandInfo = (ops GPR, GPR, i32imm); +} + +// so_imm - Match a 32-bit shifter_operand immediate operand, which is an +// 8-bit immediate rotated by an arbitrary number of bits. so_imm values are +// represented in the imm field in the same 12-bit form that they are encoded +// into so_imm instructions: the 8-bit immediate is the least significant bits +// [bits 0-7], the 4-bit shift amount is the next 4 bits [bits 8-11]. 
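A minimal decoder for that 12-bit form, assuming the standard ARM data-processing rule that the 4-bit field holds half of the (always even) rotate-right amount; the name is hypothetical and the real helpers are the ARM_AM::getSOImmVal family in ARMAddressingModes.h used throughout this file:

  // Recover the 32-bit immediate from the 12-bit so_imm encoding:
  // an 8-bit value in bits [7:0], rotated right by 2 * bits [11:8].
  static inline unsigned decodeSOImm(unsigned Enc) {
    unsigned Imm = Enc & 0xFF;
    unsigned Rot = 2 * ((Enc >> 8) & 0xF);
    return Rot == 0 ? Imm : ((Imm >> Rot) | (Imm << (32 - Rot)));
  }
  // e.g. decodeSOImm(0xA05) == 0x5000   (5 rotated right by 20 bits)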
+def so_imm : Operand<i32>, + PatLeaf<(imm), [{ + return ARM_AM::getSOImmVal(N->getZExtValue()) != -1; + }]> { + let PrintMethod = "printSOImmOperand"; +} + +// Break so_imm's up into two pieces. This handles immediates with up to 16 +// bits set in them. This uses so_imm2part to match and so_imm2part_[12] to +// get the first/second pieces. +def so_imm2part : Operand<i32>, + PatLeaf<(imm), [{ + return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue()); + }]> { + let PrintMethod = "printSOImm2PartOperand"; +} + +def so_imm2part_1 : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getSOImmTwoPartFirst((unsigned)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +def so_imm2part_2 : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getSOImmTwoPartSecond((unsigned)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +def so_neg_imm2part : Operand<i32>, PatLeaf<(imm), [{ + return ARM_AM::isSOImmTwoPartVal(-(int)N->getZExtValue()); + }]> { + let PrintMethod = "printSOImm2PartOperand"; +} + +def so_neg_imm2part_1 : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getSOImmTwoPartFirst(-(int)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +def so_neg_imm2part_2 : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getSOImmTwoPartSecond(-(int)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31]. +def imm0_31 : Operand<i32>, PatLeaf<(imm), [{ + return (int32_t)N->getZExtValue() < 32; +}]>; + +// Define ARM specific addressing modes. + +// addrmode2 := reg +/- reg shop imm +// addrmode2 := reg +/- imm12 +// +def addrmode2 : Operand<i32>, + ComplexPattern<i32, 3, "SelectAddrMode2", []> { + let PrintMethod = "printAddrMode2Operand"; + let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); +} + +def am2offset : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode2Offset", []> { + let PrintMethod = "printAddrMode2OffsetOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +// addrmode3 := reg +/- reg +// addrmode3 := reg +/- imm8 +// +def addrmode3 : Operand<i32>, + ComplexPattern<i32, 3, "SelectAddrMode3", []> { + let PrintMethod = "printAddrMode3Operand"; + let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); +} + +def am3offset : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode3Offset", []> { + let PrintMethod = "printAddrMode3OffsetOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +// addrmode4 := reg, <mode|W> +// +def addrmode4 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode4", []> { + let PrintMethod = "printAddrMode4Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm); +} + +// addrmode5 := reg +/- imm8*4 +// +def addrmode5 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode5", []> { + let PrintMethod = "printAddrMode5Operand"; + let MIOperandInfo = (ops GPR:$base, i32imm); +} + +// addrmode6 := reg with optional writeback +// +def addrmode6 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode6", []> { + let PrintMethod = "printAddrMode6Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm); +} + +def am6offset : Operand<i32> { + let PrintMethod = "printAddrMode6OffsetOperand"; + let MIOperandInfo = (ops GPR); +} + +// addrmodepc := pc + reg +// +def addrmodepc : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrModePC", []> { + let PrintMethod = "printAddrModePCOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +def nohash_imm : Operand<i32> { + let PrintMethod = 
"printNoHashImmediate"; +} + +//===----------------------------------------------------------------------===// + +include "ARMInstrFormats.td" + +//===----------------------------------------------------------------------===// +// Multiclass helpers... +// + +/// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a +/// binop that produces a value. +multiclass AsI1_bin_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { + def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, + IIC_iALUi, opc, "\t$dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> { + let Inst{25} = 1; + } + def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, + IIC_iALUr, opc, "\t$dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> { + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let isCommutable = Commutable; + } + def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, + IIC_iALUsr, opc, "\t$dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> { + let Inst{25} = 0; + } +} + +/// AI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the +/// instruction modifies the CPSR register. +let Defs = [CPSR] in { +multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { + def ri : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, + IIC_iALUi, opc, "\t$dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> { + let Inst{20} = 1; + let Inst{25} = 1; + } + def rr : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, + IIC_iALUr, opc, "\t$dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> { + let isCommutable = Commutable; + let Inst{11-4} = 0b00000000; + let Inst{20} = 1; + let Inst{25} = 0; + } + def rs : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, + IIC_iALUsr, opc, "\t$dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> { + let Inst{20} = 1; + let Inst{25} = 0; + } +} +} + +/// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test +/// patterns. Similar to AsI1_bin_irs except the instruction does not produce +/// a explicit result, only implicitly set CPSR. +let Defs = [CPSR] in { +multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { + def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iCMPi, + opc, "\t$a, $b", + [(opnode GPR:$a, so_imm:$b)]> { + let Inst{20} = 1; + let Inst{25} = 1; + } + def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm, IIC_iCMPr, + opc, "\t$a, $b", + [(opnode GPR:$a, GPR:$b)]> { + let Inst{11-4} = 0b00000000; + let Inst{20} = 1; + let Inst{25} = 0; + let isCommutable = Commutable; + } + def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iCMPsr, + opc, "\t$a, $b", + [(opnode GPR:$a, so_reg:$b)]> { + let Inst{20} = 1; + let Inst{25} = 0; + } +} +} + +/// AI_unary_rrot - A unary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. +/// FIXME: Remove the 'r' variant. Its rot_imm is zero. 
+multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> { + def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src), + IIC_iUNAr, opc, "\t$dst, $src", + [(set GPR:$dst, (opnode GPR:$src))]>, + Requires<[IsARM, HasV6]> { + let Inst{11-10} = 0b00; + let Inst{19-16} = 0b1111; + } + def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot), + IIC_iUNAsi, opc, "\t$dst, $src, ror $rot", + [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>, + Requires<[IsARM, HasV6]> { + let Inst{19-16} = 0b1111; + } +} + +multiclass AI_unary_rrot_np<bits<8> opcod, string opc> { + def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src), + IIC_iUNAr, opc, "\t$dst, $src", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{11-10} = 0b00; + let Inst{19-16} = 0b1111; + } + def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot), + IIC_iUNAsi, opc, "\t$dst, $src, ror $rot", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{19-16} = 0b1111; + } +} + +/// AI_bin_rrot - A binary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. +multiclass AI_bin_rrot<bits<8> opcod, string opc, PatFrag opnode> { + def rr : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), + IIC_iALUr, opc, "\t$dst, $LHS, $RHS", + [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>, + Requires<[IsARM, HasV6]> { + let Inst{11-10} = 0b00; + } + def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, + i32imm:$rot), + IIC_iALUsi, opc, "\t$dst, $LHS, $RHS, ror $rot", + [(set GPR:$dst, (opnode GPR:$LHS, + (rotr GPR:$RHS, rot_imm:$rot)))]>, + Requires<[IsARM, HasV6]>; +} + +// For disassembly only. +multiclass AI_bin_rrot_np<bits<8> opcod, string opc> { + def rr : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), + IIC_iALUr, opc, "\t$dst, $LHS, $RHS", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{11-10} = 0b00; + } + def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, + i32imm:$rot), + IIC_iALUsi, opc, "\t$dst, $LHS, $RHS, ror $rot", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]>; +} + +/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube. 
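adde and sube are the DAG nodes for add/subtract that consume the carry produced by a preceding addc/subc. A 64-bit add, for instance, becomes "adds r0, r0, r2" for the low words followed by "adc r1, r1, r3" for the high words; that trailing adc is the kind of instruction this multiclass defines, hence the Uses = [CPSR] wrapper below (and Defs = [CPSR] on the carry-setting variants).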
+let Uses = [CPSR] in { +multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { + def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), + DPFrm, IIC_iALUi, opc, "\t$dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, + Requires<[IsARM]> { + let Inst{25} = 1; + } + def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + DPFrm, IIC_iALUr, opc, "\t$dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, + Requires<[IsARM]> { + let isCommutable = Commutable; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + } + def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), + DPSoRegFrm, IIC_iALUsr, opc, "\t$dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, + Requires<[IsARM]> { + let Inst{25} = 0; + } +} +// Carry setting variants +let Defs = [CPSR] in { +multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { + def Sri : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), + DPFrm, IIC_iALUi, !strconcat(opc, "\t$dst, $a, $b"), + [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, + Requires<[IsARM]> { + let Inst{20} = 1; + let Inst{25} = 1; + } + def Srr : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + DPFrm, IIC_iALUr, !strconcat(opc, "\t$dst, $a, $b"), + [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, + Requires<[IsARM]> { + let Inst{11-4} = 0b00000000; + let Inst{20} = 1; + let Inst{25} = 0; + } + def Srs : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), + DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "\t$dst, $a, $b"), + [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, + Requires<[IsARM]> { + let Inst{20} = 1; + let Inst{25} = 0; + } +} +} +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in +/// the function. The first operand is the ID# for this instruction, the second +/// is the index into the MachineConstantPool that this is, the third is the +/// size in bytes of this constant pool entry. +let neverHasSideEffects = 1, isNotDuplicable = 1 in +def CONSTPOOL_ENTRY : +PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, + i32imm:$size), NoItinerary, + "${instid:label} ${cpidx:cpentry}", []>; + +// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE +// from removing one half of the matched pairs. That breaks PEI, which assumes +// these will always be in pairs, and asserts if it finds otherwise. Better way? 
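The two pseudos always bracket a call: instruction selection emits ADJCALLSTACKDOWN with the outgoing-argument byte count before the call and ADJCALLSTACKUP after it, and prologue/epilogue insertion later rewrites each pair into real sp adjustments (or deletes them) once the final frame layout is known, which is why losing one half of a pair trips the assertion mentioned above.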
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { +def ADJCALLSTACKUP : +PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary, + "${:comment} ADJCALLSTACKUP $amt1", + [(ARMcallseq_end timm:$amt1, timm:$amt2)]>; + +def ADJCALLSTACKDOWN : +PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, + "${:comment} ADJCALLSTACKDOWN $amt", + [(ARMcallseq_start timm:$amt)]>; +} + +def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-16} = 0b001100100000; + let Inst{7-0} = 0b00000000; +} + +def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-16} = 0b001100100000; + let Inst{7-0} = 0b00000001; +} + +def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-16} = 0b001100100000; + let Inst{7-0} = 0b00000010; +} + +def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-16} = 0b001100100000; + let Inst{7-0} = 0b00000011; +} + +def SEL : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, NoItinerary, "sel", + "\t$dst, $a, $b", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{27-20} = 0b01101000; + let Inst{7-4} = 0b1011; +} + +def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-16} = 0b001100100000; + let Inst{7-0} = 0b00000100; +} + +// The i32imm operand $val can be used by a debugger to store more information +// about the breakpoint. +def BKPT : AI<(outs), (ins i32imm:$val), MiscFrm, NoItinerary, "bkpt", "\t$val", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM]> { + let Inst{27-20} = 0b00010010; + let Inst{7-4} = 0b0111; +} + +// Change Processor State is a system instruction -- for disassembly only. +// The singleton $opt operand contains the following information: +// opt{4-0} = mode from Inst{4-0} +// opt{5} = changemode from Inst{17} +// opt{8-6} = AIF from Inst{8-6} +// opt{10-9} = imod from Inst{19-18} with 0b10 as enable and 0b11 as disable +def CPS : AXI<(outs), (ins cps_opt:$opt), MiscFrm, NoItinerary, "cps$opt", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM]> { + let Inst{31-28} = 0b1111; + let Inst{27-20} = 0b00010000; + let Inst{16} = 0; + let Inst{5} = 0; +} + +// Preload signals the memory system of possible future data/instruction access. +// These are for disassembly only. +// +// A8.6.117, A8.6.118. Different instructions are generated for #0 and #-0. +// The neg_zero operand translates -0 to -1, -1 to -2, ..., etc. 
+multiclass APreLoad<bit data, bit read, string opc> { + + def i : AXI<(outs), (ins GPR:$base, neg_zero:$imm), MiscFrm, NoItinerary, + !strconcat(opc, "\t[$base, $imm]"), []> { + let Inst{31-26} = 0b111101; + let Inst{25} = 0; // 0 for immediate form + let Inst{24} = data; + let Inst{22} = read; + let Inst{21-20} = 0b01; + } + + def r : AXI<(outs), (ins addrmode2:$addr), MiscFrm, NoItinerary, + !strconcat(opc, "\t$addr"), []> { + let Inst{31-26} = 0b111101; + let Inst{25} = 1; // 1 for register form + let Inst{24} = data; + let Inst{22} = read; + let Inst{21-20} = 0b01; + let Inst{4} = 0; + } +} + +defm PLD : APreLoad<1, 1, "pld">; +defm PLDW : APreLoad<1, 0, "pldw">; +defm PLI : APreLoad<0, 1, "pli">; + +def SETENDBE : AXI<(outs),(ins), MiscFrm, NoItinerary, "setend\tbe", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM]> { + let Inst{31-28} = 0b1111; + let Inst{27-20} = 0b00010000; + let Inst{16} = 1; + let Inst{9} = 1; + let Inst{7-4} = 0b0000; +} + +def SETENDLE : AXI<(outs),(ins), MiscFrm, NoItinerary, "setend\tle", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM]> { + let Inst{31-28} = 0b1111; + let Inst{27-20} = 0b00010000; + let Inst{16} = 1; + let Inst{9} = 0; + let Inst{7-4} = 0b0000; +} + +def DBG : AI<(outs), (ins i32imm:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV7]> { + let Inst{27-16} = 0b001100100000; + let Inst{7-4} = 0b1111; +} + +// A5.4 Permanently UNDEFINED instructions. +// FIXME: Temporary emitted as raw bytes until this pseudo-op will be added to +// binutils +let isBarrier = 1, isTerminator = 1 in +def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, + ".long 0xe7ffdefe ${:comment} trap", [(trap)]>, + Requires<[IsARM]> { + let Inst{27-25} = 0b011; + let Inst{24-20} = 0b11111; + let Inst{7-5} = 0b111; + let Inst{4} = 0b1; +} + +// Address computation and loads and stores in PIC mode. 
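PICADD below materializes a PIC address as a local label followed by "add $dst, pc, $a". Because reading pc in ARM state yields the address of the current instruction plus 8, the register operand $a is expected to hold a constant-pool value biased against the label position so that the sum comes out to the target address; the PICLDR*/PICSTR* variants fold the final load or store into the same labelled sequence.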
+let isNotDuplicable = 1 in { +def PICADD : AXI1<0b0100, (outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p), + Pseudo, IIC_iALUr, "\n$cp:\n\tadd$p\t$dst, pc, $a", + [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>; + +let AddedComplexity = 10 in { +def PICLDR : AXI2ldw<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), + Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr$p\t$dst, $addr", + [(set GPR:$dst, (load addrmodepc:$addr))]>; + +def PICLDRH : AXI3ldh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), + Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrh${p}\t$dst, $addr", + [(set GPR:$dst, (zextloadi16 addrmodepc:$addr))]>; + +def PICLDRB : AXI2ldb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), + Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrb${p}\t$dst, $addr", + [(set GPR:$dst, (zextloadi8 addrmodepc:$addr))]>; + +def PICLDRSH : AXI3ldsh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), + Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrsh${p}\t$dst, $addr", + [(set GPR:$dst, (sextloadi16 addrmodepc:$addr))]>; + +def PICLDRSB : AXI3ldsb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), + Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrsb${p}\t$dst, $addr", + [(set GPR:$dst, (sextloadi8 addrmodepc:$addr))]>; +} +let AddedComplexity = 10 in { +def PICSTR : AXI2stw<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr$p\t$src, $addr", + [(store GPR:$src, addrmodepc:$addr)]>; + +def PICSTRH : AXI3sth<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstrh${p}\t$src, $addr", + [(truncstorei16 GPR:$src, addrmodepc:$addr)]>; + +def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstrb${p}\t$src, $addr", + [(truncstorei8 GPR:$src, addrmodepc:$addr)]>; +} +} // isNotDuplicable = 1 + + +// LEApcrel - Load a pc-relative address into a register without offending the +// assembler. +let neverHasSideEffects = 1 in { +let isReMaterializable = 1 in +def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), + Pseudo, IIC_iALUi, + "adr$p\t$dst, #$label", []>; + +def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), + (ins i32imm:$label, nohash_imm:$id, pred:$p), + Pseudo, IIC_iALUi, + "adr$p\t$dst, #${label}_${id}", []> { + let Inst{25} = 1; +} +} // neverHasSideEffects + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. 
+// + +let isReturn = 1, isTerminator = 1, isBarrier = 1 in { + // ARMV4T and above + def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br, + "bx", "\tlr", [(ARMretflag)]>, + Requires<[IsARM, HasV4T]> { + let Inst{3-0} = 0b1110; + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + } + + // ARMV4 only + def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br, + "mov", "\tpc, lr", [(ARMretflag)]>, + Requires<[IsARM, NoV4T]> { + let Inst{11-0} = 0b000000001110; + let Inst{15-12} = 0b1111; + let Inst{19-16} = 0b0000; + let Inst{27-20} = 0b00011010; + } +} + +// Indirect branches +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + // ARMV4T and above + def BRIND : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst", + [(brind GPR:$dst)]>, + Requires<[IsARM, HasV4T]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + let Inst{31-28} = 0b1110; + } + + // ARMV4 only + def MOVPCRX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "mov\tpc, $dst", + [(brind GPR:$dst)]>, + Requires<[IsARM, NoV4T]> { + let Inst{11-4} = 0b00000000; + let Inst{15-12} = 0b1111; + let Inst{19-16} = 0b0000; + let Inst{27-20} = 0b00011010; + let Inst{31-28} = 0b1110; + } +} + +// FIXME: remove when we have a way to marking a MI with these properties. +// FIXME: Should pc be an implicit operand like PICADD, etc? +let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, + hasExtraDefRegAllocReq = 1 in + def LDM_RET : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, + reglist:$dsts, variable_ops), + IndexModeUpd, LdStMulFrm, IIC_Br, + "ldm${addr:submode}${p}\t$addr!, $dsts", + "$addr.addr = $wb", []>; + +// On non-Darwin platforms R9 is callee-saved. +let isCall = 1, + Defs = [R0, R1, R2, R3, R12, LR, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { + def BL : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops), + IIC_Br, "bl\t${func:call}", + [(ARMcall tglobaladdr:$func)]>, + Requires<[IsARM, IsNotDarwin]> { + let Inst{31-28} = 0b1110; + } + + def BL_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops), + IIC_Br, "bl", "\t${func:call}", + [(ARMcall_pred tglobaladdr:$func)]>, + Requires<[IsARM, IsNotDarwin]>; + + // ARMv5T and above + def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, + IIC_Br, "blx\t$func", + [(ARMcall GPR:$func)]>, + Requires<[IsARM, HasV5T, IsNotDarwin]> { + let Inst{7-4} = 0b0011; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + } + + // ARMv4T + // Note: Restrict $func to the tGPR regclass to prevent it being in LR. + def BX : ABXIx2<(outs), (ins tGPR:$func, variable_ops), + IIC_Br, "mov\tlr, pc\n\tbx\t$func", + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, HasV4T, IsNotDarwin]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + } + + // ARMv4 + def BMOVPCRX : ABXIx2<(outs), (ins tGPR:$func, variable_ops), + IIC_Br, "mov\tlr, pc\n\tmov\tpc, $func", + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, NoV4T, IsNotDarwin]> { + let Inst{11-4} = 0b00000000; + let Inst{15-12} = 0b1111; + let Inst{19-16} = 0b0000; + let Inst{27-20} = 0b00011010; + } +} + +// On Darwin R9 is call-clobbered. 
+let isCall = 1, + Defs = [R0, R1, R2, R3, R9, R12, LR, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { + def BLr9 : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops), + IIC_Br, "bl\t${func:call}", + [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]> { + let Inst{31-28} = 0b1110; + } + + def BLr9_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops), + IIC_Br, "bl", "\t${func:call}", + [(ARMcall_pred tglobaladdr:$func)]>, + Requires<[IsARM, IsDarwin]>; + + // ARMv5T and above + def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, + IIC_Br, "blx\t$func", + [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> { + let Inst{7-4} = 0b0011; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + } + + // ARMv4T + // Note: Restrict $func to the tGPR regclass to prevent it being in LR. + def BXr9 : ABXIx2<(outs), (ins tGPR:$func, variable_ops), + IIC_Br, "mov\tlr, pc\n\tbx\t$func", + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, HasV4T, IsDarwin]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + } + + // ARMv4 + def BMOVPCRXr9 : ABXIx2<(outs), (ins tGPR:$func, variable_ops), + IIC_Br, "mov\tlr, pc\n\tmov\tpc, $func", + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, NoV4T, IsDarwin]> { + let Inst{11-4} = 0b00000000; + let Inst{15-12} = 0b1111; + let Inst{19-16} = 0b0000; + let Inst{27-20} = 0b00011010; + } +} + +let isBranch = 1, isTerminator = 1 in { + // B is "predicable" since it can be xformed into a Bcc. + let isBarrier = 1 in { + let isPredicable = 1 in + def B : ABXI<0b1010, (outs), (ins brtarget:$target), IIC_Br, + "b\t$target", [(br bb:$target)]>; + + let isNotDuplicable = 1, isIndirectBranch = 1 in { + def BR_JTr : JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id), + IIC_Br, "mov\tpc, $target \n$jt", + [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]> { + let Inst{11-4} = 0b00000000; + let Inst{15-12} = 0b1111; + let Inst{20} = 0; // S Bit + let Inst{24-21} = 0b1101; + let Inst{27-25} = 0b000; + } + def BR_JTm : JTI<(outs), + (ins addrmode2:$target, jtblock_operand:$jt, i32imm:$id), + IIC_Br, "ldr\tpc, $target \n$jt", + [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt, + imm:$id)]> { + let Inst{15-12} = 0b1111; + let Inst{20} = 1; // L bit + let Inst{21} = 0; // W bit + let Inst{22} = 0; // B bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b011; + } + def BR_JTadd : JTI<(outs), + (ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id), + IIC_Br, "add\tpc, $target, $idx \n$jt", + [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt, + imm:$id)]> { + let Inst{15-12} = 0b1111; + let Inst{20} = 0; // S bit + let Inst{24-21} = 0b0100; + let Inst{27-25} = 0b000; + } + } // isNotDuplicable = 1, isIndirectBranch = 1 + } // isBarrier = 1 + + // FIXME: should be able to write a pattern for ARMBrcond, but can't use + // a two-value operand where a dag node expects two operands. 
:( + def Bcc : ABI<0b1010, (outs), (ins brtarget:$target), + IIC_Br, "b", "\t$target", + [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>; +} + +// Branch and Exchange Jazelle -- for disassembly only +def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0010; + //let Inst{19-8} = 0xfff; + let Inst{7-4} = 0b0010; +} + +// Secure Monitor Call is a system instruction -- for disassembly only +def SMC : ABI<0b0001, (outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0110; + let Inst{7-4} = 0b0111; +} + +// Supervisor Call (Software Interrupt) -- for disassembly only +let isCall = 1 in { +def SVC : ABI<0b1111, (outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc", + [/* For disassembly only; pattern left blank */]>; +} + +// Store Return State is a system instruction -- for disassembly only +def SRSW : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, i32imm:$mode), + NoItinerary, "srs${addr:submode}\tsp!, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{22-20} = 0b110; // W = 1 +} + +def SRS : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, i32imm:$mode), + NoItinerary, "srs${addr:submode}\tsp, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{22-20} = 0b100; // W = 0 +} + +// Return From Exception is a system instruction -- for disassembly only +def RFEW : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, GPR:$base), + NoItinerary, "rfe${addr:submode}\t$base!", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{22-20} = 0b011; // W = 1 +} + +def RFE : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, GPR:$base), + NoItinerary, "rfe${addr:submode}\t$base", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{22-20} = 0b001; // W = 0 +} + +//===----------------------------------------------------------------------===// +// Load / store Instructions. +// + +// Load +let canFoldAsLoad = 1, isReMaterializable = 1 in +def LDR : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr, + "ldr", "\t$dst, $addr", + [(set GPR:$dst, (load addrmode2:$addr))]>; + +// Special LDR for loads from non-pc-relative constpools. 
+let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1, + isReMaterializable = 1 in +def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr, + "ldr", "\t$dst, $addr", []>; + +// Loads with zero extension +def LDRH : AI3ldh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoadr, "ldrh", "\t$dst, $addr", + [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>; + +def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, + IIC_iLoadr, "ldrb", "\t$dst, $addr", + [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>; + +// Loads with sign extension +def LDRSH : AI3ldsh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoadr, "ldrsh", "\t$dst, $addr", + [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>; + +def LDRSB : AI3ldsb<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoadr, "ldrsb", "\t$dst, $addr", + [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +// Load doubleword +def LDRD : AI3ldd<(outs GPR:$dst1, GPR:$dst2), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoadr, "ldrd", "\t$dst1, $addr", + []>, Requires<[IsARM, HasV5TE]>; + +// Indexed loads +def LDR_PRE : AI2ldwpr<(outs GPR:$dst, GPR:$base_wb), + (ins addrmode2:$addr), LdFrm, IIC_iLoadru, + "ldr", "\t$dst, $addr!", "$addr.base = $base_wb", []>; + +def LDR_POST : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru, + "ldr", "\t$dst, [$base], $offset", "$base = $base_wb", []>; + +def LDRH_PRE : AI3ldhpr<(outs GPR:$dst, GPR:$base_wb), + (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru, + "ldrh", "\t$dst, $addr!", "$addr.base = $base_wb", []>; + +def LDRH_POST : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, + "ldrh", "\t$dst, [$base], $offset", "$base = $base_wb", []>; + +def LDRB_PRE : AI2ldbpr<(outs GPR:$dst, GPR:$base_wb), + (ins addrmode2:$addr), LdFrm, IIC_iLoadru, + "ldrb", "\t$dst, $addr!", "$addr.base = $base_wb", []>; + +def LDRB_POST : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru, + "ldrb", "\t$dst, [$base], $offset", "$base = $base_wb", []>; + +def LDRSH_PRE : AI3ldshpr<(outs GPR:$dst, GPR:$base_wb), + (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru, + "ldrsh", "\t$dst, $addr!", "$addr.base = $base_wb", []>; + +def LDRSH_POST: AI3ldshpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, + "ldrsh", "\t$dst, [$base], $offset", "$base = $base_wb", []>; + +def LDRSB_PRE : AI3ldsbpr<(outs GPR:$dst, GPR:$base_wb), + (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru, + "ldrsb", "\t$dst, $addr!", "$addr.base = $base_wb", []>; + +def LDRSB_POST: AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, + "ldrsb", "\t$dst, [$base], $offset", "$base = $base_wb", []>; + +// For disassembly only +def LDRD_PRE : AI3lddpr<(outs GPR:$dst1, GPR:$dst2, GPR:$base_wb), + (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadr, + "ldrd", "\t$dst1, $dst2, $addr!", "$addr.base = $base_wb", []>, + Requires<[IsARM, HasV5TE]>; + +// For disassembly only +def LDRD_POST : AI3lddpo<(outs GPR:$dst1, GPR:$dst2, GPR:$base_wb), + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadr, + "ldrd", "\t$dst1, $dst2, [$base], $offset", "$base = $base_wb", []>, + Requires<[IsARM, HasV5TE]>; + +} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 + +// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT are for disassembly only. 
+ +def LDRT : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru, + "ldrt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + let Inst{21} = 1; // overwrite +} + +def LDRBT : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru, + "ldrbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + let Inst{21} = 1; // overwrite +} + +def LDRSBT : AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, + "ldrsbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + let Inst{21} = 1; // overwrite +} + +def LDRHT : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, am3offset:$offset), LdMiscFrm, IIC_iLoadru, + "ldrht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + let Inst{21} = 1; // overwrite +} + +def LDRSHT : AI3ldshpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, + "ldrsht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + let Inst{21} = 1; // overwrite +} + +// Store +def STR : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer, + "str", "\t$src, $addr", + [(store GPR:$src, addrmode2:$addr)]>; + +// Stores with truncate +def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm, + IIC_iStorer, "strh", "\t$src, $addr", + [(truncstorei16 GPR:$src, addrmode3:$addr)]>; + +def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer, + "strb", "\t$src, $addr", + [(truncstorei8 GPR:$src, addrmode2:$addr)]>; + +// Store doubleword +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in +def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr), + StMiscFrm, IIC_iStorer, + "strd", "\t$src1, $addr", []>, Requires<[IsARM, HasV5TE]>; + +// Indexed stores +def STR_PRE : AI2stwpr<(outs GPR:$base_wb), + (ins GPR:$src, GPR:$base, am2offset:$offset), + StFrm, IIC_iStoreru, + "str", "\t$src, [$base, $offset]!", "$base = $base_wb", + [(set GPR:$base_wb, + (pre_store GPR:$src, GPR:$base, am2offset:$offset))]>; + +def STR_POST : AI2stwpo<(outs GPR:$base_wb), + (ins GPR:$src, GPR:$base,am2offset:$offset), + StFrm, IIC_iStoreru, + "str", "\t$src, [$base], $offset", "$base = $base_wb", + [(set GPR:$base_wb, + (post_store GPR:$src, GPR:$base, am2offset:$offset))]>; + +def STRH_PRE : AI3sthpr<(outs GPR:$base_wb), + (ins GPR:$src, GPR:$base,am3offset:$offset), + StMiscFrm, IIC_iStoreru, + "strh", "\t$src, [$base, $offset]!", "$base = $base_wb", + [(set GPR:$base_wb, + (pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>; + +def STRH_POST: AI3sthpo<(outs GPR:$base_wb), + (ins GPR:$src, GPR:$base,am3offset:$offset), + StMiscFrm, IIC_iStoreru, + "strh", "\t$src, [$base], $offset", "$base = $base_wb", + [(set GPR:$base_wb, (post_truncsti16 GPR:$src, + GPR:$base, am3offset:$offset))]>; + +def STRB_PRE : AI2stbpr<(outs GPR:$base_wb), + (ins GPR:$src, GPR:$base,am2offset:$offset), + StFrm, IIC_iStoreru, + "strb", "\t$src, [$base, $offset]!", "$base = $base_wb", + [(set GPR:$base_wb, (pre_truncsti8 GPR:$src, + GPR:$base, am2offset:$offset))]>; + +def STRB_POST: AI2stbpo<(outs GPR:$base_wb), + (ins GPR:$src, GPR:$base,am2offset:$offset), + StFrm, IIC_iStoreru, + "strb", "\t$src, [$base], $offset", "$base = $base_wb", + [(set GPR:$base_wb, (post_truncsti8 GPR:$src, + GPR:$base, am2offset:$offset))]>; + +// For disassembly only +def STRD_PRE : AI3stdpr<(outs GPR:$base_wb), + (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset), + 
StMiscFrm, IIC_iStoreru, + "strd", "\t$src1, $src2, [$base, $offset]!", + "$base = $base_wb", []>; + +// For disassembly only +def STRD_POST: AI3stdpo<(outs GPR:$base_wb), + (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset), + StMiscFrm, IIC_iStoreru, + "strd", "\t$src1, $src2, [$base], $offset", + "$base = $base_wb", []>; + +// STRT, STRBT, and STRHT are for disassembly only. + +def STRT : AI2stwpo<(outs GPR:$base_wb), + (ins GPR:$src, GPR:$base,am2offset:$offset), + StFrm, IIC_iStoreru, + "strt", "\t$src, [$base], $offset", "$base = $base_wb", + [/* For disassembly only; pattern left blank */]> { + let Inst{21} = 1; // overwrite +} + +def STRBT : AI2stbpo<(outs GPR:$base_wb), + (ins GPR:$src, GPR:$base,am2offset:$offset), + StFrm, IIC_iStoreru, + "strbt", "\t$src, [$base], $offset", "$base = $base_wb", + [/* For disassembly only; pattern left blank */]> { + let Inst{21} = 1; // overwrite +} + +def STRHT: AI3sthpo<(outs GPR:$base_wb), + (ins GPR:$src, GPR:$base,am3offset:$offset), + StMiscFrm, IIC_iStoreru, + "strht", "\t$src, [$base], $offset", "$base = $base_wb", + [/* For disassembly only; pattern left blank */]> { + let Inst{21} = 1; // overwrite +} + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +def LDM : AXI4ld<(outs), (ins addrmode4:$addr, pred:$p, + reglist:$dsts, variable_ops), + IndexModeNone, LdStMulFrm, IIC_iLoadm, + "ldm${addr:submode}${p}\t$addr, $dsts", "", []>; + +def LDM_UPD : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, + reglist:$dsts, variable_ops), + IndexModeUpd, LdStMulFrm, IIC_iLoadm, + "ldm${addr:submode}${p}\t$addr!, $dsts", + "$addr.addr = $wb", []>; +} // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq + +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { +def STM : AXI4st<(outs), (ins addrmode4:$addr, pred:$p, + reglist:$srcs, variable_ops), + IndexModeNone, LdStMulFrm, IIC_iStorem, + "stm${addr:submode}${p}\t$addr, $srcs", "", []>; + +def STM_UPD : AXI4st<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, + reglist:$srcs, variable_ops), + IndexModeUpd, LdStMulFrm, IIC_iStorem, + "stm${addr:submode}${p}\t$addr!, $srcs", + "$addr.addr = $wb", []>; +} // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq + +//===----------------------------------------------------------------------===// +// Move Instructions. 
+// + +let neverHasSideEffects = 1 in +def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, + "mov", "\t$dst, $src", []>, UnaryDP { + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; +} + +def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), + DPSoRegFrm, IIC_iMOVsr, + "mov", "\t$dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP { + let Inst{25} = 0; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm, IIC_iMOVi, + "mov", "\t$dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP { + let Inst{25} = 1; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src), + DPFrm, IIC_iMOVi, + "movw", "\t$dst, $src", + [(set GPR:$dst, imm0_65535:$src)]>, + Requires<[IsARM, HasV6T2]>, UnaryDP { + let Inst{20} = 0; + let Inst{25} = 1; +} + +let Constraints = "$src = $dst" in +def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm), + DPFrm, IIC_iMOVi, + "movt", "\t$dst, $imm", + [(set GPR:$dst, + (or (and GPR:$src, 0xffff), + lo16AllZero:$imm))]>, UnaryDP, + Requires<[IsARM, HasV6T2]> { + let Inst{20} = 0; + let Inst{25} = 1; +} + +def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>, + Requires<[IsARM, HasV6T2]>; + +let Uses = [CPSR] in +def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi, + "mov", "\t$dst, $src, rrx", + [(set GPR:$dst, (ARMrrx GPR:$src))]>, UnaryDP; + +// These aren't really mov instructions, but we have to define them this way +// due to flag operands. + +let Defs = [CPSR] in { +def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, + IIC_iMOVsi, "movs", "\t$dst, $src, lsr #1", + [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP; +def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, + IIC_iMOVsi, "movs", "\t$dst, $src, asr #1", + [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP; +} + +//===----------------------------------------------------------------------===// +// Extend Instructions. +// + +// Sign extenders + +defm SXTB : AI_unary_rrot<0b01101010, + "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; +defm SXTH : AI_unary_rrot<0b01101011, + "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; + +defm SXTAB : AI_bin_rrot<0b01101010, + "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; +defm SXTAH : AI_bin_rrot<0b01101011, + "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; + +// For disassembly only +defm SXTB16 : AI_unary_rrot_np<0b01101000, "sxtb16">; + +// For disassembly only +defm SXTAB16 : AI_bin_rrot_np<0b01101000, "sxtab16">; + +// Zero extenders + +let AddedComplexity = 16 in { +defm UXTB : AI_unary_rrot<0b01101110, + "uxtb" , UnOpFrag<(and node:$Src, 0x000000FF)>>; +defm UXTH : AI_unary_rrot<0b01101111, + "uxth" , UnOpFrag<(and node:$Src, 0x0000FFFF)>>; +defm UXTB16 : AI_unary_rrot<0b01101100, + "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; + +def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), + (UXTB16r_rot GPR:$Src, 24)>; +def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), + (UXTB16r_rot GPR:$Src, 8)>; + +defm UXTAB : AI_bin_rrot<0b01101110, "uxtab", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; +defm UXTAH : AI_bin_rrot<0b01101111, "uxtah", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; +} + +// This isn't safe in general, the add is two 16-bit units, not a 32-bit add. 
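The halfword-lane behaviour is easiest to see next to an ordinary 32-bit add. Below is a minimal C sketch (illustrative only, not part of this patch; the helper name is made up) showing how a carry out of the low lane is discarded by a uxtab16-style add but propagated by a 32-bit add, which is why no DAG pattern is attached:

#include <stdint.h>
#include <stdio.h>

/* uxtab16-style add: each 16-bit lane adds its own zero-extended byte of rm;
   a carry out of the low lane is simply dropped, never propagated upward. */
static uint32_t uxtab16_like(uint32_t rn, uint32_t rm) {
  uint16_t lo = (uint16_t)(rn & 0xFFFF) + (uint8_t)(rm & 0xFF);
  uint16_t hi = (uint16_t)(rn >> 16)    + (uint8_t)((rm >> 16) & 0xFF);
  return ((uint32_t)hi << 16) | lo;
}

int main(void) {
  uint32_t rn = 0x0000FF01u, rm = 0x000000FFu;
  uint32_t lanes = uxtab16_like(rn, rm);      /* 0x00000000: low-lane carry lost */
  uint32_t add32 = rn + (rm & 0x00FF00FFu);   /* 0x00010000: carry reaches bit 16 */
  printf("0x%08x vs 0x%08x\n", (unsigned)lanes, (unsigned)add32);
  return 0;
}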
+// For disassembly only +defm UXTAB16 : AI_bin_rrot_np<0b01101100, "uxtab16">; + + +def SBFX : I<(outs GPR:$dst), + (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi, + "sbfx", "\t$dst, $src, $lsb, $width", "", []>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-21} = 0b0111101; + let Inst{6-4} = 0b101; +} + +def UBFX : I<(outs GPR:$dst), + (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi, + "ubfx", "\t$dst, $src, $lsb, $width", "", []>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-21} = 0b0111111; + let Inst{6-4} = 0b101; +} + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. +// + +defm ADD : AsI1_bin_irs<0b0100, "add", + BinOpFrag<(add node:$LHS, node:$RHS)>, 1>; +defm SUB : AsI1_bin_irs<0b0010, "sub", + BinOpFrag<(sub node:$LHS, node:$RHS)>>; + +// ADD and SUB with 's' bit set. +defm ADDS : AI1_bin_s_irs<0b0100, "adds", + BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>; +defm SUBS : AI1_bin_s_irs<0b0010, "subs", + BinOpFrag<(subc node:$LHS, node:$RHS)>>; + +defm ADC : AI1_adde_sube_irs<0b0101, "adc", + BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>; +defm SBC : AI1_adde_sube_irs<0b0110, "sbc", + BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>; +defm ADCS : AI1_adde_sube_s_irs<0b0101, "adcs", + BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; +defm SBCS : AI1_adde_sube_s_irs<0b0110, "sbcs", + BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>; + +// These don't define reg/reg forms, because they are handled above. +def RSBri : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, + IIC_iALUi, "rsb", "\t$dst, $a, $b", + [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]> { + let Inst{25} = 1; +} + +def RSBrs : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, + IIC_iALUsr, "rsb", "\t$dst, $a, $b", + [(set GPR:$dst, (sub so_reg:$b, GPR:$a))]> { + let Inst{25} = 0; +} + +// RSB with 's' bit set. +let Defs = [CPSR] in { +def RSBSri : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, + IIC_iALUi, "rsbs", "\t$dst, $a, $b", + [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]> { + let Inst{20} = 1; + let Inst{25} = 1; +} +def RSBSrs : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, + IIC_iALUsr, "rsbs", "\t$dst, $a, $b", + [(set GPR:$dst, (subc so_reg:$b, GPR:$a))]> { + let Inst{20} = 1; + let Inst{25} = 0; +} +} + +let Uses = [CPSR] in { +def RSCri : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), + DPFrm, IIC_iALUi, "rsc", "\t$dst, $a, $b", + [(set GPR:$dst, (sube_dead_carry so_imm:$b, GPR:$a))]>, + Requires<[IsARM]> { + let Inst{25} = 1; +} +def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), + DPSoRegFrm, IIC_iALUsr, "rsc", "\t$dst, $a, $b", + [(set GPR:$dst, (sube_dead_carry so_reg:$b, GPR:$a))]>, + Requires<[IsARM]> { + let Inst{25} = 0; +} +} + +// FIXME: Allow these to be predicated. +let Defs = [CPSR], Uses = [CPSR] in { +def RSCSri : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), + DPFrm, IIC_iALUi, "rscs\t$dst, $a, $b", + [(set GPR:$dst, (sube_dead_carry so_imm:$b, GPR:$a))]>, + Requires<[IsARM]> { + let Inst{20} = 1; + let Inst{25} = 1; +} +def RSCSrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), + DPSoRegFrm, IIC_iALUsr, "rscs\t$dst, $a, $b", + [(set GPR:$dst, (sube_dead_carry so_reg:$b, GPR:$a))]>, + Requires<[IsARM]> { + let Inst{20} = 1; + let Inst{25} = 0; +} +} + +// (sub X, imm) gets canonicalized to (add X, -imm). 
Match this form. +def : ARMPat<(add GPR:$src, so_imm_neg:$imm), + (SUBri GPR:$src, so_imm_neg:$imm)>; + +//def : ARMPat<(addc GPR:$src, so_imm_neg:$imm), +// (SUBSri GPR:$src, so_imm_neg:$imm)>; +//def : ARMPat<(adde GPR:$src, so_imm_neg:$imm), +// (SBCri GPR:$src, so_imm_neg:$imm)>; + +// Note: These are implemented in C++ code, because they have to generate +// ADD/SUBrs instructions, which use a complex pattern that a xform function +// cannot produce. +// (mul X, 2^n+1) -> (add (X << n), X) +// (mul X, 2^n-1) -> (rsb X, (X << n)) + +// ARM Arithmetic Instruction -- for disassembly only +// GPR:$dst = GPR:$a op GPR:$b +class AAI<bits<8> op27_20, bits<4> op7_4, string opc> + : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, IIC_iALUr, + opc, "\t$dst, $a, $b", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-20} = op27_20; + let Inst{7-4} = op7_4; +} + +// Saturating add/subtract -- for disassembly only + +def QADD : AAI<0b00010000, 0b0101, "qadd">; +def QADD16 : AAI<0b01100010, 0b0001, "qadd16">; +def QADD8 : AAI<0b01100010, 0b1001, "qadd8">; +def QASX : AAI<0b01100010, 0b0011, "qasx">; +def QDADD : AAI<0b00010100, 0b0101, "qdadd">; +def QDSUB : AAI<0b00010110, 0b0101, "qdsub">; +def QSAX : AAI<0b01100010, 0b0101, "qsax">; +def QSUB : AAI<0b00010010, 0b0101, "qsub">; +def QSUB16 : AAI<0b01100010, 0b0111, "qsub16">; +def QSUB8 : AAI<0b01100010, 0b1111, "qsub8">; +def UQADD16 : AAI<0b01100110, 0b0001, "uqadd16">; +def UQADD8 : AAI<0b01100110, 0b1001, "uqadd8">; +def UQASX : AAI<0b01100110, 0b0011, "uqasx">; +def UQSAX : AAI<0b01100110, 0b0101, "uqsax">; +def UQSUB16 : AAI<0b01100110, 0b0111, "uqsub16">; +def UQSUB8 : AAI<0b01100110, 0b1111, "uqsub8">; + +// Signed/Unsigned add/subtract -- for disassembly only + +def SASX : AAI<0b01100001, 0b0011, "sasx">; +def SADD16 : AAI<0b01100001, 0b0001, "sadd16">; +def SADD8 : AAI<0b01100001, 0b1001, "sadd8">; +def SSAX : AAI<0b01100001, 0b0101, "ssax">; +def SSUB16 : AAI<0b01100001, 0b0111, "ssub16">; +def SSUB8 : AAI<0b01100001, 0b1111, "ssub8">; +def UASX : AAI<0b01100101, 0b0011, "uasx">; +def UADD16 : AAI<0b01100101, 0b0001, "uadd16">; +def UADD8 : AAI<0b01100101, 0b1001, "uadd8">; +def USAX : AAI<0b01100101, 0b0101, "usax">; +def USUB16 : AAI<0b01100101, 0b0111, "usub16">; +def USUB8 : AAI<0b01100101, 0b1111, "usub8">; + +// Signed/Unsigned halving add/subtract -- for disassembly only + +def SHASX : AAI<0b01100011, 0b0011, "shasx">; +def SHADD16 : AAI<0b01100011, 0b0001, "shadd16">; +def SHADD8 : AAI<0b01100011, 0b1001, "shadd8">; +def SHSAX : AAI<0b01100011, 0b0101, "shsax">; +def SHSUB16 : AAI<0b01100011, 0b0111, "shsub16">; +def SHSUB8 : AAI<0b01100011, 0b1111, "shsub8">; +def UHASX : AAI<0b01100111, 0b0011, "uhasx">; +def UHADD16 : AAI<0b01100111, 0b0001, "uhadd16">; +def UHADD8 : AAI<0b01100111, 0b1001, "uhadd8">; +def UHSAX : AAI<0b01100111, 0b0101, "uhsax">; +def UHSUB16 : AAI<0b01100111, 0b0111, "uhsub16">; +def UHSUB8 : AAI<0b01100111, 0b1111, "uhsub8">; + +// Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only + +def USAD8 : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), + MulFrm /* for convenience */, NoItinerary, "usad8", + "\t$dst, $a, $b", []>, + Requires<[IsARM, HasV6]> { + let Inst{27-20} = 0b01111000; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b0001; +} +def USADA8 : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + MulFrm /* for convenience */, NoItinerary, "usada8", + "\t$dst, $a, $b, $acc", []>, + Requires<[IsARM, HasV6]> { + let Inst{27-20} = 0b01111000; + let Inst{7-4} = 
0b0001; +} + +// Signed/Unsigned saturate -- for disassembly only + +def SSATlsl : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), + DPFrm, NoItinerary, "ssat", "\t$dst, $bit_pos, $a, lsl $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-21} = 0b0110101; + let Inst{6-4} = 0b001; +} + +def SSATasr : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), + DPFrm, NoItinerary, "ssat", "\t$dst, $bit_pos, $a, asr $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-21} = 0b0110101; + let Inst{6-4} = 0b101; +} + +def SSAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), DPFrm, + NoItinerary, "ssat16", "\t$dst, $bit_pos, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-20} = 0b01101010; + let Inst{7-4} = 0b0011; +} + +def USATlsl : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), + DPFrm, NoItinerary, "usat", "\t$dst, $bit_pos, $a, lsl $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-21} = 0b0110111; + let Inst{6-4} = 0b001; +} + +def USATasr : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), + DPFrm, NoItinerary, "usat", "\t$dst, $bit_pos, $a, asr $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-21} = 0b0110111; + let Inst{6-4} = 0b101; +} + +def USAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), DPFrm, + NoItinerary, "usat16", "\t$dst, $bit_pos, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-20} = 0b01101110; + let Inst{7-4} = 0b0011; +} + +//===----------------------------------------------------------------------===// +// Bitwise Instructions. +// + +defm AND : AsI1_bin_irs<0b0000, "and", + BinOpFrag<(and node:$LHS, node:$RHS)>, 1>; +defm ORR : AsI1_bin_irs<0b1100, "orr", + BinOpFrag<(or node:$LHS, node:$RHS)>, 1>; +defm EOR : AsI1_bin_irs<0b0001, "eor", + BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>; +defm BIC : AsI1_bin_irs<0b1110, "bic", + BinOpFrag<(and node:$LHS, (not node:$RHS))>>; + +def BFC : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi, + "bfc", "\t$dst, $imm", "$src = $dst", + [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-21} = 0b0111110; + let Inst{6-0} = 0b0011111; +} + +// A8.6.18 BFI - Bitfield insert (Encoding A1) +// Added for disassembler with the pattern field purposely left blank. 
+def BFI : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi, + "bfi", "\t$dst, $src, $imm", "", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-21} = 0b0111110; + let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15 +} + +def MVNr : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, + "mvn", "\t$dst, $src", + [(set GPR:$dst, (not GPR:$src))]>, UnaryDP { + let Inst{25} = 0; + let Inst{11-4} = 0b00000000; +} +def MVNs : AsI1<0b1111, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm, + IIC_iMOVsr, "mvn", "\t$dst, $src", + [(set GPR:$dst, (not so_reg:$src))]>, UnaryDP { + let Inst{25} = 0; +} +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm, + IIC_iMOVi, "mvn", "\t$dst, $imm", + [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP { + let Inst{25} = 1; +} + +def : ARMPat<(and GPR:$src, so_imm_not:$imm), + (BICri GPR:$src, so_imm_not:$imm)>; + +//===----------------------------------------------------------------------===// +// Multiply Instructions. +// + +let isCommutable = 1 in +def MUL : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMUL32, "mul", "\t$dst, $a, $b", + [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>; + +def MLA : AsMul1I<0b0000001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), + IIC_iMAC32, "mla", "\t$dst, $a, $b, $c", + [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>; + +def MLS : AMul1I<0b0000011, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), + IIC_iMAC32, "mls", "\t$dst, $a, $b, $c", + [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>, + Requires<[IsARM, HasV6T2]>; + +// Extra precision multiplies with low / high results +let neverHasSideEffects = 1 in { +let isCommutable = 1 in { +def SMULL : AsMul1I<0b0000110, (outs GPR:$ldst, GPR:$hdst), + (ins GPR:$a, GPR:$b), IIC_iMUL64, + "smull", "\t$ldst, $hdst, $a, $b", []>; + +def UMULL : AsMul1I<0b0000100, (outs GPR:$ldst, GPR:$hdst), + (ins GPR:$a, GPR:$b), IIC_iMUL64, + "umull", "\t$ldst, $hdst, $a, $b", []>; +} + +// Multiply + accumulate +def SMLAL : AsMul1I<0b0000111, (outs GPR:$ldst, GPR:$hdst), + (ins GPR:$a, GPR:$b), IIC_iMAC64, + "smlal", "\t$ldst, $hdst, $a, $b", []>; + +def UMLAL : AsMul1I<0b0000101, (outs GPR:$ldst, GPR:$hdst), + (ins GPR:$a, GPR:$b), IIC_iMAC64, + "umlal", "\t$ldst, $hdst, $a, $b", []>; + +def UMAAL : AMul1I <0b0000010, (outs GPR:$ldst, GPR:$hdst), + (ins GPR:$a, GPR:$b), IIC_iMAC64, + "umaal", "\t$ldst, $hdst, $a, $b", []>, + Requires<[IsARM, HasV6]>; +} // neverHasSideEffects + +// Most significant word multiply +def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMUL32, "smmul", "\t$dst, $a, $b", + [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b0001; + let Inst{15-12} = 0b1111; +} + +def SMMULR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMUL32, "smmulr", "\t$dst, $a, $b", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b0011; // R = 1 + let Inst{15-12} = 0b1111; +} + +def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), + IIC_iMAC32, "smmla", "\t$dst, $a, $b, $c", + [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b0001; +} + +def SMMLAR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), + IIC_iMAC32, "smmlar", "\t$dst, $a, $b, $c", + [/* For disassembly 
only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b0011; // R = 1 +} + +def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), + IIC_iMAC32, "smmls", "\t$dst, $a, $b, $c", + [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b1101; +} + +def SMMLSR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), + IIC_iMAC32, "smmlsr", "\t$dst, $a, $b, $c", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b1111; // R = 1 +} + +multiclass AI_smul<string opc, PatFrag opnode> { + def BB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMUL32, !strconcat(opc, "bb"), "\t$dst, $a, $b", + [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), + (sext_inreg GPR:$b, i16)))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 0; + let Inst{6} = 0; + } + + def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMUL32, !strconcat(opc, "bt"), "\t$dst, $a, $b", + [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), + (sra GPR:$b, (i32 16))))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 0; + let Inst{6} = 1; + } + + def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMUL32, !strconcat(opc, "tb"), "\t$dst, $a, $b", + [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), + (sext_inreg GPR:$b, i16)))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 1; + let Inst{6} = 0; + } + + def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMUL32, !strconcat(opc, "tt"), "\t$dst, $a, $b", + [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), + (sra GPR:$b, (i32 16))))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 1; + let Inst{6} = 1; + } + + def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMUL16, !strconcat(opc, "wb"), "\t$dst, $a, $b", + [(set GPR:$dst, (sra (opnode GPR:$a, + (sext_inreg GPR:$b, i16)), (i32 16)))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 1; + let Inst{6} = 0; + } + + def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMUL16, !strconcat(opc, "wt"), "\t$dst, $a, $b", + [(set GPR:$dst, (sra (opnode GPR:$a, + (sra GPR:$b, (i32 16))), (i32 16)))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 1; + let Inst{6} = 1; + } +} + + +multiclass AI_smla<string opc, PatFrag opnode> { + def BB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + IIC_iMAC16, !strconcat(opc, "bb"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, + (opnode (sext_inreg GPR:$a, i16), + (sext_inreg GPR:$b, i16))))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 0; + let Inst{6} = 0; + } + + def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + IIC_iMAC16, !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16), + (sra GPR:$b, (i32 16)))))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 0; + let Inst{6} = 1; + } + + def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + IIC_iMAC16, !strconcat(opc, "tb"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), + (sext_inreg GPR:$b, i16))))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 1; + let Inst{6} = 0; + } + + def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + IIC_iMAC16, !strconcat(opc, "tt"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), + (sra GPR:$b, (i32 
16)))))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 1; + let Inst{6} = 1; + } + + def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + IIC_iMAC16, !strconcat(opc, "wb"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, + (sext_inreg GPR:$b, i16)), (i32 16))))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 0; + let Inst{6} = 0; + } + + def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + IIC_iMAC16, !strconcat(opc, "wt"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, + (sra GPR:$b, (i32 16))), (i32 16))))]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 0; + let Inst{6} = 1; + } +} + +defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; +defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; + +// Halfword multiply accumulate long: SMLAL<x><y> -- for disassembly only +def SMLALBB : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), + IIC_iMAC64, "smlalbb", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 0; + let Inst{6} = 0; +} + +def SMLALBT : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), + IIC_iMAC64, "smlalbt", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 0; + let Inst{6} = 1; +} + +def SMLALTB : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), + IIC_iMAC64, "smlaltb", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 1; + let Inst{6} = 0; +} + +def SMLALTT : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), + IIC_iMAC64, "smlaltt", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV5TE]> { + let Inst{5} = 1; + let Inst{6} = 1; +} + +// Helper class for AI_smld -- for disassembly only +class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> { + let Inst{4} = 1; + let Inst{5} = swap; + let Inst{6} = sub; + let Inst{7} = 0; + let Inst{21-20} = 0b00; + let Inst{22} = long; + let Inst{27-23} = 0b01110; +} + +multiclass AI_smld<bit sub, string opc> { + + def D : AMulDualI<0, sub, 0, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + NoItinerary, !strconcat(opc, "d"), "\t$dst, $a, $b, $acc">; + + def DX : AMulDualI<0, sub, 1, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + NoItinerary, !strconcat(opc, "dx"), "\t$dst, $a, $b, $acc">; + + def LD : AMulDualI<1, sub, 0, (outs GPR:$ldst,GPR:$hdst), (ins GPR:$a,GPR:$b), + NoItinerary, !strconcat(opc, "ld"), "\t$ldst, $hdst, $a, $b">; + + def LDX : AMulDualI<1, sub, 1, (outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), + NoItinerary, !strconcat(opc, "ldx"),"\t$ldst, $hdst, $a, $b">; + +} + +defm SMLA : AI_smld<0, "smla">; +defm SMLS : AI_smld<1, "smls">; + +multiclass AI_sdml<bit sub, string opc> { + + def D : AMulDualI<0, sub, 0, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + NoItinerary, !strconcat(opc, "d"), "\t$dst, $a, $b"> { + let Inst{15-12} = 0b1111; + } + + def DX : AMulDualI<0, sub, 1, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + NoItinerary, !strconcat(opc, "dx"), "\t$dst, $a, $b"> { + let Inst{15-12} = 0b1111; + } + +} + +defm SMUA : AI_sdml<0, "smua">; +defm SMUS : AI_sdml<1, "smus">; + 
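For reference, here is a small C sketch of the arithmetic the AI_smul/AI_smla patterns above describe (illustrative only, not part of this patch; the helper names are made up and an arithmetic right shift on signed values is assumed):

#include <stdint.h>
#include <stdio.h>

/* SMULBB: signed 16x16 multiply of the two low halfwords. */
static int32_t smulbb(int32_t a, int32_t b) {
  return (int32_t)(int16_t)a * (int32_t)(int16_t)b;
}

/* SMULTB: high halfword of a times low halfword of b. */
static int32_t smultb(int32_t a, int32_t b) {
  return (a >> 16) * (int32_t)(int16_t)b;
}

/* SMULWB: 32x16 multiply keeping the upper 32 bits of the 48-bit product,
   i.e. the (sra (mul ...), 16) form matched by the WB/WT patterns. */
static int32_t smulwb(int32_t a, int32_t b) {
  return (int32_t)(((int64_t)a * (int16_t)b) >> 16);
}

/* SMLABB: the same bottom-bottom product plus an accumulator. */
static int32_t smlabb(int32_t a, int32_t b, int32_t acc) {
  return acc + smulbb(a, b);
}

int main(void) {
  printf("%d %d %d %d\n", smulbb(0x7FFF0002, 3), smultb(0x00050000, 4),
         smulwb(1 << 16, 3), smlabb(2, 3, 10));   /* 6 20 3 16 */
  return 0;
}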
+//===----------------------------------------------------------------------===// +// Misc. Arithmetic Instructions. +// + +def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + "clz", "\t$dst, $src", + [(set GPR:$dst, (ctlz GPR:$src))]>, Requires<[IsARM, HasV5T]> { + let Inst{7-4} = 0b0001; + let Inst{11-8} = 0b1111; + let Inst{19-16} = 0b1111; +} + +def RBIT : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + "rbit", "\t$dst, $src", + [(set GPR:$dst, (ARMrbit GPR:$src))]>, + Requires<[IsARM, HasV6T2]> { + let Inst{7-4} = 0b0011; + let Inst{11-8} = 0b1111; + let Inst{19-16} = 0b1111; +} + +def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + "rev", "\t$dst, $src", + [(set GPR:$dst, (bswap GPR:$src))]>, Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b0011; + let Inst{11-8} = 0b1111; + let Inst{19-16} = 0b1111; +} + +def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + "rev16", "\t$dst, $src", + [(set GPR:$dst, + (or (and (srl GPR:$src, (i32 8)), 0xFF), + (or (and (shl GPR:$src, (i32 8)), 0xFF00), + (or (and (srl GPR:$src, (i32 8)), 0xFF0000), + (and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b1011; + let Inst{11-8} = 0b1111; + let Inst{19-16} = 0b1111; +} + +def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + "revsh", "\t$dst, $src", + [(set GPR:$dst, + (sext_inreg + (or (srl (and GPR:$src, 0xFF00), (i32 8)), + (shl GPR:$src, (i32 8))), i16))]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b1011; + let Inst{11-8} = 0b1111; + let Inst{19-16} = 0b1111; +} + +def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst), + (ins GPR:$src1, GPR:$src2, i32imm:$shamt), + IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2, lsl $shamt", + [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF), + (and (shl GPR:$src2, (i32 imm:$shamt)), + 0xFFFF0000)))]>, + Requires<[IsARM, HasV6]> { + let Inst{6-4} = 0b001; +} + +// Alternate cases for PKHBT where identities eliminate some nodes. +def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)), + (PKHBT GPR:$src1, GPR:$src2, 0)>; +def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)), + (PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>; + + +def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst), + (ins GPR:$src1, GPR:$src2, i32imm:$shamt), + IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2, asr $shamt", + [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000), + (and (sra GPR:$src2, imm16_31:$shamt), + 0xFFFF)))]>, Requires<[IsARM, HasV6]> { + let Inst{6-4} = 0b101; +} + +// Alternate cases for PKHTB where identities eliminate some nodes. Note that +// a shift amount of 0 is *not legal* here, it is PKHBT instead. +def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))), + (PKHTB GPR:$src1, GPR:$src2, 16)>; +def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), + (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)), + (PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>; + +//===----------------------------------------------------------------------===// +// Comparison Instructions... +// + +defm CMP : AI1_cmp_irs<0b1010, "cmp", + BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; +//FIXME: Disable CMN, as CCodes are backwards from compare expectations +// Compare-to-zero still works out, just not the relationals +//defm CMN : AI1_cmp_irs<0b1011, "cmn", +// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; + +// Note that TST/TEQ don't set all the same flags that CMP does! 
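To make that note concrete: TST and TEQ set N and Z from a bitwise result, take C from the shifter carry-out, and leave V alone, whereas CMP derives all four flags from a subtraction. A rough C model of the difference (illustrative only, not part of this patch; the helper names are made up):

#include <stdint.h>
#include <stdio.h>

typedef struct { int n, z, c, v; } Flags;

/* TST-style update: N/Z from (rn & op2), C from the shifter carry-out,
   V left as it was.  TEQ behaves the same with XOR instead of AND. */
static Flags arm_tst(uint32_t rn, uint32_t op2, int shifter_carry, Flags f) {
  uint32_t r = rn & op2;
  f.n = (int)(r >> 31);
  f.z = (r == 0);
  f.c = shifter_carry;            /* f.v deliberately untouched */
  return f;
}

/* CMP-style update: all of N, Z, C, V come from rn - op2. */
static Flags arm_cmp(uint32_t rn, uint32_t op2) {
  uint32_t r = rn - op2;
  Flags f;
  f.n = (int)(r >> 31);
  f.z = (r == 0);
  f.c = (rn >= op2);                                  /* no borrow */
  f.v = (int)((((rn ^ op2) & (rn ^ r)) >> 31) & 1);   /* signed overflow */
  return f;
}

int main(void) {
  Flags t = arm_tst(0x80000000u, 0x80000000u, 1, (Flags){0, 0, 0, 1});
  Flags c = arm_cmp(0, 1);
  printf("tst: n=%d z=%d c=%d v=%d  cmp: n=%d z=%d c=%d v=%d\n",
         t.n, t.z, t.c, t.v, c.n, c.z, c.c, c.v);
  return 0;
}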
+defm TST : AI1_cmp_irs<0b1000, "tst", + BinOpFrag<(ARMcmpZ (and node:$LHS, node:$RHS), 0)>, 1>; +defm TEQ : AI1_cmp_irs<0b1001, "teq", + BinOpFrag<(ARMcmpZ (xor node:$LHS, node:$RHS), 0)>, 1>; + +defm CMPz : AI1_cmp_irs<0b1010, "cmp", + BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>; +defm CMNz : AI1_cmp_irs<0b1011, "cmn", + BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>; + +//def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm), +// (CMNri GPR:$src, so_imm_neg:$imm)>; + +def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm), + (CMNzri GPR:$src, so_imm_neg:$imm)>; + + +// Conditional moves +// FIXME: should be able to write a pattern for ARMcmov, but can't use +// a two-value operand where a dag node expects two operands. :( +let neverHasSideEffects = 1 in { +def MOVCCr : AI1<0b1101, (outs GPR:$dst), (ins GPR:$false, GPR:$true), DPFrm, + IIC_iCMOVr, "mov", "\t$dst, $true", + [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $dst">, UnaryDP { + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; +} + +def MOVCCs : AI1<0b1101, (outs GPR:$dst), + (ins GPR:$false, so_reg:$true), DPSoRegFrm, IIC_iCMOVsr, + "mov", "\t$dst, $true", + [/*(set GPR:$dst, (ARMcmov GPR:$false, so_reg:$true, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $dst">, UnaryDP { + let Inst{25} = 0; +} + +def MOVCCi : AI1<0b1101, (outs GPR:$dst), + (ins GPR:$false, so_imm:$true), DPFrm, IIC_iCMOVi, + "mov", "\t$dst, $true", + [/*(set GPR:$dst, (ARMcmov GPR:$false, so_imm:$true, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $dst">, UnaryDP { + let Inst{25} = 1; +} +} // neverHasSideEffects + +//===----------------------------------------------------------------------===// +// Atomic operations intrinsics +// + +// memory barriers protect the atomic sequences +let hasSideEffects = 1 in { +def Int_MemBarrierV7 : AInoP<(outs), (ins), + Pseudo, NoItinerary, + "dmb", "", + [(ARMMemBarrierV7)]>, + Requires<[IsARM, HasV7]> { + let Inst{31-4} = 0xf57ff05; + // FIXME: add support for options other than a full system DMB + // See DMB disassembly-only variants below. + let Inst{3-0} = 0b1111; +} + +def Int_SyncBarrierV7 : AInoP<(outs), (ins), + Pseudo, NoItinerary, + "dsb", "", + [(ARMSyncBarrierV7)]>, + Requires<[IsARM, HasV7]> { + let Inst{31-4} = 0xf57ff04; + // FIXME: add support for options other than a full system DSB + // See DSB disassembly-only variants below. 
+ let Inst{3-0} = 0b1111; +} + +def Int_MemBarrierV6 : AInoP<(outs), (ins GPR:$zero), + Pseudo, NoItinerary, + "mcr", "\tp15, 0, $zero, c7, c10, 5", + [(ARMMemBarrierV6 GPR:$zero)]>, + Requires<[IsARM, HasV6]> { + // FIXME: add support for options other than a full system DMB + // FIXME: add encoding +} + +def Int_SyncBarrierV6 : AInoP<(outs), (ins GPR:$zero), + Pseudo, NoItinerary, + "mcr", "\tp15, 0, $zero, c7, c10, 4", + [(ARMSyncBarrierV6 GPR:$zero)]>, + Requires<[IsARM, HasV6]> { + // FIXME: add support for options other than a full system DSB + // FIXME: add encoding +} +} + +// Helper class for multiclass MemB -- for disassembly only +class AMBI<string opc, string asm> + : AInoP<(outs), (ins), MiscFrm, NoItinerary, opc, asm, + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV7]> { + let Inst{31-20} = 0xf57; +} + +multiclass MemB<bits<4> op7_4, string opc> { + + def st : AMBI<opc, "\tst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1110; + } + + def ish : AMBI<opc, "\tish"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1011; + } + + def ishst : AMBI<opc, "\tishst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1010; + } + + def nsh : AMBI<opc, "\tnsh"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0111; + } + + def nshst : AMBI<opc, "\tnshst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0110; + } + + def osh : AMBI<opc, "\tosh"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0011; + } + + def oshst : AMBI<opc, "\toshst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0010; + } +} + +// These DMB variants are for disassembly only. +defm DMB : MemB<0b0101, "dmb">; + +// These DSB variants are for disassembly only. +defm DSB : MemB<0b0100, "dsb">; + +// ISB has only full system option -- for disassembly only +def ISBsy : AMBI<"isb", ""> { + let Inst{7-4} = 0b0110; + let Inst{3-0} = 0b1111; +} + +let usesCustomInserter = 1 in { + let Uses = [CPSR] in { + def ATOMIC_LOAD_ADD_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_ADD_I8 PSEUDO!", + [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_SUB_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_SUB_I8 PSEUDO!", + [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_AND_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_AND_I8 PSEUDO!", + [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_OR_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_OR_I8 PSEUDO!", + [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_XOR_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_XOR_I8 PSEUDO!", + [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_NAND_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_NAND_I8 PSEUDO!", + [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_ADD_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_ADD_I16 PSEUDO!", + [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_SUB_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_SUB_I16 PSEUDO!", + [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, 
GPR:$incr))]>; + def ATOMIC_LOAD_AND_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_AND_I16 PSEUDO!", + [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_OR_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_OR_I16 PSEUDO!", + [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_XOR_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_XOR_I16 PSEUDO!", + [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_NAND_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_NAND_I16 PSEUDO!", + [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_ADD_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_ADD_I32 PSEUDO!", + [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_SUB_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_SUB_I32 PSEUDO!", + [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_AND_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_AND_I32 PSEUDO!", + [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_OR_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_OR_I32 PSEUDO!", + [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_XOR_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_XOR_I32 PSEUDO!", + [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_NAND_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + "${:comment} ATOMIC_LOAD_NAND_I32 PSEUDO!", + [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>; + + def ATOMIC_SWAP_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, + "${:comment} ATOMIC_SWAP_I8 PSEUDO!", + [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>; + def ATOMIC_SWAP_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, + "${:comment} ATOMIC_SWAP_I16 PSEUDO!", + [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>; + def ATOMIC_SWAP_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, + "${:comment} ATOMIC_SWAP_I32 PSEUDO!", + [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>; + + def ATOMIC_CMP_SWAP_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, + "${:comment} ATOMIC_CMP_SWAP_I8 PSEUDO!", + [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>; + def ATOMIC_CMP_SWAP_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, + "${:comment} ATOMIC_CMP_SWAP_I16 PSEUDO!", + [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>; + def ATOMIC_CMP_SWAP_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, + "${:comment} ATOMIC_CMP_SWAP_I32 PSEUDO!", + [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>; +} +} + +let mayLoad = 1 in { +def LDREXB : AIldrex<0b10, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary, + "ldrexb", "\t$dest, [$ptr]", + []>; +def LDREXH : AIldrex<0b11, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary, 
+ "ldrexh", "\t$dest, [$ptr]",
+ []>;
+def LDREX : AIldrex<0b00, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary,
+ "ldrex", "\t$dest, [$ptr]",
+ []>;
+def LDREXD : AIldrex<0b01, (outs GPR:$dest, GPR:$dest2), (ins GPR:$ptr),
+ NoItinerary,
+ "ldrexd", "\t$dest, $dest2, [$ptr]",
+ []>;
+}
+
+let mayStore = 1, Constraints = "@earlyclobber $success" in {
+def STREXB : AIstrex<0b10, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+ NoItinerary,
+ "strexb", "\t$success, $src, [$ptr]",
+ []>;
+def STREXH : AIstrex<0b11, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+ NoItinerary,
+ "strexh", "\t$success, $src, [$ptr]",
+ []>;
+def STREX : AIstrex<0b00, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+ NoItinerary,
+ "strex", "\t$success, $src, [$ptr]",
+ []>;
+def STREXD : AIstrex<0b01, (outs GPR:$success),
+ (ins GPR:$src, GPR:$src2, GPR:$ptr),
+ NoItinerary,
+ "strexd", "\t$success, $src, $src2, [$ptr]",
+ []>;
+}
+
+// Clear-Exclusive is for disassembly only.
+def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
+ [/* For disassembly only; pattern left blank */]>,
+ Requires<[IsARM, HasV7]> {
+ let Inst{31-20} = 0xf57;
+ let Inst{7-4} = 0b0001;
+}
+
+// SWP/SWPB are deprecated in V6/V7 and for disassembly only.
+let mayLoad = 1 in {
+def SWP : AI<(outs GPR:$dst), (ins GPR:$src, GPR:$ptr), LdStExFrm, NoItinerary,
+ "swp", "\t$dst, $src, [$ptr]",
+ [/* For disassembly only; pattern left blank */]> {
+ let Inst{27-23} = 0b00010;
+ let Inst{22} = 0; // B = 0
+ let Inst{21-20} = 0b00;
+ let Inst{7-4} = 0b1001;
+}
+
+def SWPB : AI<(outs GPR:$dst), (ins GPR:$src, GPR:$ptr), LdStExFrm, NoItinerary,
+ "swpb", "\t$dst, $src, [$ptr]",
+ [/* For disassembly only; pattern left blank */]> {
+ let Inst{27-23} = 0b00010;
+ let Inst{22} = 1; // B = 1
+ let Inst{21-20} = 0b00;
+ let Inst{7-4} = 0b1001;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+let isCall = 1,
+ Defs = [R0, R12, LR, CPSR] in {
+ def TPsoft : ABXI<0b1011, (outs), (ins), IIC_Br,
+ "bl\t__aeabi_read_tp",
+ [(set R0, ARMthread_pointer)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+// eh_sjlj_setjmp() is an instruction sequence to store the return
+// address and save #0 in R0 for the non-longjmp case.
+// Since by its nature we may be coming from some other function to get
+// here, and we're using the stack frame for the containing function to
+// save/restore registers, we can't keep anything live in regs across
+// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+// when we get here from a longjmp(). We force everything out of registers
+// except for our own input by listing the relevant registers in Defs. By
+// doing so, we also cause the prologue/epilogue code to actively preserve
+// all of the callee-saved registers, which is exactly what we want.
+// A constant value is passed in $val, and we use the location as a scratch.
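Conceptually, the contract being implemented is the familiar setjmp/longjmp one from plain C; the sketch below is an analogy only, not the backend's actual lowering:

#include <setjmp.h>
#include <stdio.h>

static jmp_buf buf;

static void do_throw(void) {
  longjmp(buf, 1);              /* resume at the saved location, "returning" 1 */
}

int main(void) {
  if (setjmp(buf) == 0) {       /* direct path: the sequence leaves 0 in R0 */
    do_throw();                 /* unwinds back to the saved resume address */
  } else {                      /* resumed path: lands on the "mov r0, #1" */
    puts("resumed via longjmp");
  }
  return 0;
}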
+let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, + D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, + D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, + D31 ] in { + def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src, GPR:$val), + AddrModeNone, SizeSpecial, IndexModeNone, + Pseudo, NoItinerary, + "str\tsp, [$src, #+8] ${:comment} eh_setjmp begin\n\t" + "add\t$val, pc, #8\n\t" + "str\t$val, [$src, #+4]\n\t" + "mov\tr0, #0\n\t" + "add\tpc, pc, #0\n\t" + "mov\tr0, #1 ${:comment} eh_setjmp end", "", + [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, + Requires<[IsARM, HasVFP2]>; +} + +let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ] in { + def Int_eh_sjlj_setjmp_nofp : XI<(outs), (ins GPR:$src, GPR:$val), + AddrModeNone, SizeSpecial, IndexModeNone, + Pseudo, NoItinerary, + "str\tsp, [$src, #+8] ${:comment} eh_setjmp begin\n\t" + "add\t$val, pc, #8\n\t" + "str\t$val, [$src, #+4]\n\t" + "mov\tr0, #0\n\t" + "add\tpc, pc, #0\n\t" + "mov\tr0, #1 ${:comment} eh_setjmp end", "", + [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, + Requires<[IsARM, NoVFP]>; +} + +// FIXME: Non-Darwin version(s) +let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, + Defs = [ R7, LR, SP ] in { +def Int_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch), + AddrModeNone, SizeSpecial, IndexModeNone, + Pseudo, NoItinerary, + "ldr\tsp, [$src, #8]\n\t" + "ldr\t$scratch, [$src, #4]\n\t" + "ldr\tr7, [$src]\n\t" + "bx\t$scratch", "", + [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, + Requires<[IsARM, IsDarwin]>; +} + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// Large immediate handling. + +// Two piece so_imms. +let isReMaterializable = 1 in +def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src), + Pseudo, IIC_iMOVi, + "mov", "\t$dst, $src", + [(set GPR:$dst, so_imm2part:$src)]>, + Requires<[IsARM, NoV6T2]>; + +def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS), + (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)), + (so_imm2part_2 imm:$RHS))>; +def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS), + (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)), + (so_imm2part_2 imm:$RHS))>; +def : ARMPat<(add GPR:$LHS, so_imm2part:$RHS), + (ADDri (ADDri GPR:$LHS, (so_imm2part_1 imm:$RHS)), + (so_imm2part_2 imm:$RHS))>; +def : ARMPat<(add GPR:$LHS, so_neg_imm2part:$RHS), + (SUBri (SUBri GPR:$LHS, (so_neg_imm2part_1 imm:$RHS)), + (so_neg_imm2part_2 imm:$RHS))>; + +// 32-bit immediate using movw + movt. +// This is a single pseudo instruction, the benefit is that it can be remat'd +// as a single unit instead of having to handle reg inputs. +// FIXME: Remove this when we can do generalized remat. +let isReMaterializable = 1 in +def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi, + "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}", + [(set GPR:$dst, (i32 imm:$src))]>, + Requires<[IsARM, HasV6T2]>; + +// ConstantPool, GlobalAddress, and JumpTable +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>, + Requires<[IsARM, DontUseMovt]>; +def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, + Requires<[IsARM, UseMovt]>; +def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (LEApcrelJT tjumptable:$dst, imm:$id)>; + +// TODO: add,sub,and, 3-instr forms? 
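A quick way to see why the two-piece patterns above are legal: an ARM shifter-operand immediate is an 8-bit value rotated right by an even amount, so a 32-bit constant that fails that test may still be coverable by two values that pass it. The stand-alone C++ sketch below is illustrative only; isSOImm is not an LLVM API, and masking the constant into 16-bit halves is a simplification of the real so_imm2part splitting that happens to work for this constant.

#include <cstdint>
#include <cstdio>

// True if V is encodable as a single ARM shifter-operand immediate:
// an 8-bit value rotated right by an even amount (0, 2, ..., 30).
static bool isSOImm(uint32_t V) {
  for (unsigned R = 0; R < 32; R += 2) {
    // Rotating left by R undoes a rotate-right-by-R encoding.
    uint32_t Rot = R ? ((V << R) | (V >> (32 - R))) : V;
    if (Rot <= 0xFF)
      return true;
  }
  return false;
}

int main() {
  uint32_t C = 0x00AB00CDu;                       // not a single so_imm
  uint32_t Hi = C & 0xFFFF0000u, Lo = C & 0xFFFFu;
  std::printf("%d %d %d\n", isSOImm(C), isSOImm(Hi), isSOImm(Lo)); // prints: 0 1 1
  return 0;
}

With both halves encodable, the add/or/xor patterns above can materialize the constant with two data-processing instructions rather than a constant-pool load.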
+ + +// Direct calls +def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, + Requires<[IsARM, IsNotDarwin]>; +def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>, + Requires<[IsARM, IsDarwin]>; + +// zextload i1 -> zextload i8 +def : ARMPat<(zextloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>; + +// extload -> zextload +def : ARMPat<(extloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>; +def : ARMPat<(extloadi8 addrmode2:$addr), (LDRB addrmode2:$addr)>; +def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>; + +def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>; +def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>; + +// smul* and smla* +def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16))), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra GPR:$b, (i32 16))), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16))), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), + (i32 16)), + (SMULWB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)), + (SMULWB GPR:$a, GPR:$b)>; + +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16)))), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul sext_16_node:$a, sext_16_node:$b)), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra GPR:$b, (i32 16)))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra GPR:$a, (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16)))), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), + (i32 16))), + (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (sra (mul GPR:$a, sext_16_node:$b), (i32 16))), + (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; + +//===----------------------------------------------------------------------===// +// Thumb Support +// + +include "ARMInstrThumb.td" + +//===----------------------------------------------------------------------===// +// Thumb2 Support +// + +include "ARMInstrThumb2.td" + +//===----------------------------------------------------------------------===// +// Floating Point Support +// + +include "ARMInstrVFP.td" + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) Support +// + +include "ARMInstrNEON.td" + +//===----------------------------------------------------------------------===// +// Coprocessor Instructions. For disassembly only. 
+// + +def CDP : ABI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, + nohash_imm:$CRd, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), + NoItinerary, "cdp", "\tp$cop, $opc1, cr$CRd, cr$CRn, cr$CRm, $opc2", + [/* For disassembly only; pattern left blank */]> { + let Inst{4} = 0; +} + +def CDP2 : ABXI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, + nohash_imm:$CRd, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), + NoItinerary, "cdp2\tp$cop, $opc1, cr$CRd, cr$CRn, cr$CRm, $opc2", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{4} = 0; +} + +class ACI<dag oops, dag iops, string opc, string asm> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, NoItinerary, + opc, asm, "", [/* For disassembly only; pattern left blank */]> { + let Inst{27-25} = 0b110; +} + +multiclass LdStCop<bits<4> op31_28, bit load, string opc> { + + def _OFFSET : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), + opc, "\tp$cop, cr$CRd, $addr"> { + let Inst{31-28} = op31_28; + let Inst{24} = 1; // P = 1 + let Inst{21} = 0; // W = 0 + let Inst{22} = 0; // D = 0 + let Inst{20} = load; + } + + def _PRE : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), + opc, "\tp$cop, cr$CRd, $addr!"> { + let Inst{31-28} = op31_28; + let Inst{24} = 1; // P = 1 + let Inst{21} = 1; // W = 1 + let Inst{22} = 0; // D = 0 + let Inst{20} = load; + } + + def _POST : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, am2offset:$offset), + opc, "\tp$cop, cr$CRd, [$base], $offset"> { + let Inst{31-28} = op31_28; + let Inst{24} = 0; // P = 0 + let Inst{21} = 1; // W = 1 + let Inst{22} = 0; // D = 0 + let Inst{20} = load; + } + + def _OPTION : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, i32imm:$option), + opc, "\tp$cop, cr$CRd, [$base], $option"> { + let Inst{31-28} = op31_28; + let Inst{24} = 0; // P = 0 + let Inst{23} = 1; // U = 1 + let Inst{21} = 0; // W = 0 + let Inst{22} = 0; // D = 0 + let Inst{20} = load; + } + + def L_OFFSET : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), + !strconcat(opc, "l"), "\tp$cop, cr$CRd, $addr"> { + let Inst{31-28} = op31_28; + let Inst{24} = 1; // P = 1 + let Inst{21} = 0; // W = 0 + let Inst{22} = 1; // D = 1 + let Inst{20} = load; + } + + def L_PRE : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), + !strconcat(opc, "l"), "\tp$cop, cr$CRd, $addr!"> { + let Inst{31-28} = op31_28; + let Inst{24} = 1; // P = 1 + let Inst{21} = 1; // W = 1 + let Inst{22} = 1; // D = 1 + let Inst{20} = load; + } + + def L_POST : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, am2offset:$offset), + !strconcat(opc, "l"), "\tp$cop, cr$CRd, [$base], $offset"> { + let Inst{31-28} = op31_28; + let Inst{24} = 0; // P = 0 + let Inst{21} = 1; // W = 1 + let Inst{22} = 1; // D = 1 + let Inst{20} = load; + } + + def L_OPTION : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, nohash_imm:$option), + !strconcat(opc, "l"), "\tp$cop, cr$CRd, [$base], $option"> { + let Inst{31-28} = op31_28; + let Inst{24} = 0; // P = 0 + let Inst{23} = 1; // U = 1 + let Inst{21} = 0; // W = 0 + let Inst{22} = 1; // D = 1 + let Inst{20} = load; + } +} + +defm LDC : LdStCop<{?,?,?,?}, 1, "ldc">; +defm LDC2 : LdStCop<0b1111, 1, "ldc2">; +defm STC : LdStCop<{?,?,?,?}, 0, "stc">; +defm STC2 : LdStCop<0b1111, 0, "stc2">; + +def MCR : ABI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, + GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), + 
NoItinerary, "mcr", "\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2", + [/* For disassembly only; pattern left blank */]> { + let Inst{20} = 0; + let Inst{4} = 1; +} + +def MCR2 : ABXI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, + GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), + NoItinerary, "mcr2\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{20} = 0; + let Inst{4} = 1; +} + +def MRC : ABI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, + GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), + NoItinerary, "mrc", "\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2", + [/* For disassembly only; pattern left blank */]> { + let Inst{20} = 1; + let Inst{4} = 1; +} + +def MRC2 : ABXI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, + GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), + NoItinerary, "mrc2\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{20} = 1; + let Inst{4} = 1; +} + +def MCRR : ABI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc, + GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm), + NoItinerary, "mcrr", "\tp$cop, $opc, $Rt, $Rt2, cr$CRm", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0100; +} + +def MCRR2 : ABXI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc, + GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm), + NoItinerary, "mcrr2\tp$cop, $opc, $Rt, $Rt2, cr$CRm", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{23-20} = 0b0100; +} + +def MRRC : ABI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc, + GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm), + NoItinerary, "mrrc", "\tp$cop, $opc, $Rt, $Rt2, cr$CRm", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0101; +} + +def MRRC2 : ABXI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc, + GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm), + NoItinerary, "mrrc2\tp$cop, $opc, $Rt, $Rt2, cr$CRm", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{23-20} = 0b0101; +} + +//===----------------------------------------------------------------------===// +// Move between special register and ARM core register -- for disassembly only +// + +def MRS : ABI<0b0001,(outs GPR:$dst),(ins), NoItinerary, "mrs", "\t$dst, cpsr", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0000; + let Inst{7-4} = 0b0000; +} + +def MRSsys : ABI<0b0001,(outs GPR:$dst),(ins), NoItinerary,"mrs","\t$dst, spsr", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0100; + let Inst{7-4} = 0b0000; +} + +def MSR : ABI<0b0001, (outs), (ins GPR:$src, msr_mask:$mask), NoItinerary, + "msr", "\tcpsr$mask, $src", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0010; + let Inst{7-4} = 0b0000; +} + +def MSRi : ABI<0b0011, (outs), (ins so_imm:$a, msr_mask:$mask), NoItinerary, + "msr", "\tcpsr$mask, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0010; + let Inst{7-4} = 0b0000; +} + +def MSRsys : ABI<0b0001, (outs), (ins GPR:$src, msr_mask:$mask), NoItinerary, + "msr", "\tspsr$mask, $src", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0110; + let Inst{7-4} = 0b0000; +} + +def MSRsysi : ABI<0b0011, (outs), (ins so_imm:$a, msr_mask:$mask), NoItinerary, + "msr", "\tspsr$mask, $a", + [/* For disassembly only; pattern left blank */]> { + let 
Inst{23-20} = 0b0110; + let Inst{7-4} = 0b0000; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td new file mode 100644 index 0000000..197ec16 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -0,0 +1,3545 @@ +//===- ARMInstrNEON.td - NEON support for ARM -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM NEON instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// NEON-specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>; + +def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>; +def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>; +def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>; +def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>; +def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>; +def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>; + +// Types for vector shift by immediates. The "SHX" version is for long and +// narrow operations where the source and destination vectors have different +// types. The "SHINS" version is for shift and insert operations. +def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def SDTARMVSHX : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; + +def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>; +def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>; +def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>; +def NEONvshlls : SDNode<"ARMISD::VSHLLs", SDTARMVSHX>; +def NEONvshllu : SDNode<"ARMISD::VSHLLu", SDTARMVSHX>; +def NEONvshlli : SDNode<"ARMISD::VSHLLi", SDTARMVSHX>; +def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>; + +def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>; +def NEONvrshru : SDNode<"ARMISD::VRSHRu", SDTARMVSH>; +def NEONvrshrn : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>; + +def NEONvqshls : SDNode<"ARMISD::VQSHLs", SDTARMVSH>; +def NEONvqshlu : SDNode<"ARMISD::VQSHLu", SDTARMVSH>; +def NEONvqshlsu : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>; +def NEONvqshrns : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>; +def NEONvqshrnu : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>; +def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>; + +def NEONvqrshrns : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>; +def NEONvqrshrnu : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>; +def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>; + +def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>; +def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>; + +def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; +def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; + +def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; + +// VDUPLANE can produce a quad-register result from a double-register source, +// so the result is not constrained to match the source. 
+def NEONvduplane : SDNode<"ARMISD::VDUPLANE", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisVT<2, i32>]>>; + +def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; +def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>; + +def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>; +def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; +def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; + +def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; +def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>; +def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>; +def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>; + +def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>; +def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>; + +//===----------------------------------------------------------------------===// +// NEON operand definitions +//===----------------------------------------------------------------------===// + +def h8imm : Operand<i8> { + let PrintMethod = "printHex8ImmOperand"; +} +def h16imm : Operand<i16> { + let PrintMethod = "printHex16ImmOperand"; +} +def h32imm : Operand<i32> { + let PrintMethod = "printHex32ImmOperand"; +} +def h64imm : Operand<i64> { + let PrintMethod = "printHex64ImmOperand"; +} + +//===----------------------------------------------------------------------===// +// NEON load / store instructions +//===----------------------------------------------------------------------===// + +let mayLoad = 1, neverHasSideEffects = 1 in { +// Use vldmia to load a Q register as a D register pair. +// This is equivalent to VLDMD except that it has a Q register operand +// instead of a pair of D registers. +def VLDMQ + : AXDI5<(outs QPR:$dst), (ins addrmode5:$addr, pred:$p), + IndexModeNone, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}, ${dst:dregpair}", "", []>; + +// Use vld1 to load a Q register as a D register pair. +// This alternative to VLDMQ allows an alignment to be specified. +// This is equivalent to VLD1q64 except that it has a Q register operand. +def VLD1q + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst), (ins addrmode6:$addr), + IIC_VLD1, "vld1", "64", "${dst:dregpair}, $addr", "", []>; +} // mayLoad = 1, neverHasSideEffects = 1 + +let mayStore = 1, neverHasSideEffects = 1 in { +// Use vstmia to store a Q register as a D register pair. +// This is equivalent to VSTMD except that it has a Q register operand +// instead of a pair of D registers. +def VSTMQ + : AXDI5<(outs), (ins QPR:$src, addrmode5:$addr, pred:$p), + IndexModeNone, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}, ${src:dregpair}", "", []>; + +// Use vst1 to store a Q register as a D register pair. +// This alternative to VSTMQ allows an alignment to be specified. +// This is equivalent to VST1q64 except that it has a Q register operand. 
+def VST1q + : NLdSt<0,0b00,0b1010,0b1100, (outs), (ins addrmode6:$addr, QPR:$src), + IIC_VST, "vst1", "64", "${src:dregpair}, $addr", "", []>; +} // mayStore = 1, neverHasSideEffects = 1 + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { + +// VLD1 : Vector Load (multiple single elements) +class VLD1D<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst\\}, $addr", "", []>; +class VLD1Q<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>; + +def VLD1d8 : VLD1D<0b0000, "8">; +def VLD1d16 : VLD1D<0b0100, "16">; +def VLD1d32 : VLD1D<0b1000, "32">; +def VLD1d64 : VLD1D<0b1100, "64">; + +def VLD1q8 : VLD1Q<0b0000, "8">; +def VLD1q16 : VLD1Q<0b0100, "16">; +def VLD1q32 : VLD1Q<0b1000, "32">; +def VLD1q64 : VLD1Q<0b1100, "64">; + +// ...with address register writeback: +class VLD1DWB<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, + "vld1", Dt, "\\{$dst\\}, $addr$offset", + "$addr.addr = $wb", []>; +class VLD1QWB<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, + "vld1", Dt, "${dst:dregpair}, $addr$offset", + "$addr.addr = $wb", []>; + +def VLD1d8_UPD : VLD1DWB<0b0000, "8">; +def VLD1d16_UPD : VLD1DWB<0b0100, "16">; +def VLD1d32_UPD : VLD1DWB<0b1000, "32">; +def VLD1d64_UPD : VLD1DWB<0b1100, "64">; + +def VLD1q8_UPD : VLD1QWB<0b0000, "8">; +def VLD1q16_UPD : VLD1QWB<0b0100, "16">; +def VLD1q32_UPD : VLD1QWB<0b1000, "32">; +def VLD1q64_UPD : VLD1QWB<0b1100, "64">; + +// ...with 3 registers (some of these are only for the disassembler): +class VLD1D3<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, + "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; +class VLD1D3WB<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, + "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", []>; + +def VLD1d8T : VLD1D3<0b0000, "8">; +def VLD1d16T : VLD1D3<0b0100, "16">; +def VLD1d32T : VLD1D3<0b1000, "32">; +def VLD1d64T : VLD1D3<0b1100, "64">; + +def VLD1d8T_UPD : VLD1D3WB<0b0000, "8">; +def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; +def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; +def VLD1d64T_UPD : VLD1D3WB<0b1100, "64">; + +// ...with 4 registers (some of these are only for the disassembler): +class VLD1D4<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, + "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>; +class VLD1D4WB<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0010,op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, + "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", "$addr.addr = $wb", + []>; + +def VLD1d8Q : VLD1D4<0b0000, "8">; +def VLD1d16Q : VLD1D4<0b0100, "16">; +def VLD1d32Q : VLD1D4<0b1000, "32">; +def VLD1d64Q : VLD1D4<0b1100, "64">; + +def VLD1d8Q_UPD : VLD1D4WB<0b0000, "8">; +def VLD1d16Q_UPD : VLD1D4WB<0b0100, "16">; +def VLD1d32Q_UPD : VLD1D4WB<0b1000, "32">; +def VLD1d64Q_UPD : VLD1D4WB<0b1100, "64">; + +// 
VLD2 : Vector Load (multiple 2-element structures) +class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr), IIC_VLD2, + "vld2", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>; +class VLD2Q<bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, 0b0011, op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD2, + "vld2", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>; + +def VLD2d8 : VLD2D<0b1000, 0b0000, "8">; +def VLD2d16 : VLD2D<0b1000, 0b0100, "16">; +def VLD2d32 : VLD2D<0b1000, 0b1000, "32">; + +def VLD2q8 : VLD2Q<0b0000, "8">; +def VLD2q16 : VLD2Q<0b0100, "16">; +def VLD2q32 : VLD2Q<0b1000, "32">; + +// ...with address register writeback: +class VLD2DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD2, + "vld2", Dt, "\\{$dst1, $dst2\\}, $addr$offset", + "$addr.addr = $wb", []>; +class VLD2QWB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, 0b0011, op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD2, + "vld2", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VLD2d8_UPD : VLD2DWB<0b1000, 0b0000, "8">; +def VLD2d16_UPD : VLD2DWB<0b1000, 0b0100, "16">; +def VLD2d32_UPD : VLD2DWB<0b1000, 0b1000, "32">; + +def VLD2q8_UPD : VLD2QWB<0b0000, "8">; +def VLD2q16_UPD : VLD2QWB<0b0100, "16">; +def VLD2q32_UPD : VLD2QWB<0b1000, "32">; + +// ...with double-spaced registers (for disassembly only): +def VLD2b8 : VLD2D<0b1001, 0b0000, "8">; +def VLD2b16 : VLD2D<0b1001, 0b0100, "16">; +def VLD2b32 : VLD2D<0b1001, 0b1000, "32">; +def VLD2b8_UPD : VLD2DWB<0b1001, 0b0000, "8">; +def VLD2b16_UPD : VLD2DWB<0b1001, 0b0100, "16">; +def VLD2b32_UPD : VLD2DWB<0b1001, 0b1000, "32">; + +// VLD3 : Vector Load (multiple 3-element structures) +class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$addr), IIC_VLD3, + "vld3", Dt, "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; + +def VLD3d8 : VLD3D<0b0100, 0b0000, "8">; +def VLD3d16 : VLD3D<0b0100, 0b0100, "16">; +def VLD3d32 : VLD3D<0b0100, 0b1000, "32">; + +// ...with address register writeback: +class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD3, + "vld3", Dt, "\\{$dst1, $dst2, $dst3\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VLD3d8_UPD : VLD3DWB<0b0100, 0b0000, "8">; +def VLD3d16_UPD : VLD3DWB<0b0100, 0b0100, "16">; +def VLD3d32_UPD : VLD3DWB<0b0100, 0b1000, "32">; + +// ...with double-spaced registers (non-updating versions for disassembly only): +def VLD3q8 : VLD3D<0b0101, 0b0000, "8">; +def VLD3q16 : VLD3D<0b0101, 0b0100, "16">; +def VLD3q32 : VLD3D<0b0101, 0b1000, "32">; +def VLD3q8_UPD : VLD3DWB<0b0101, 0b0000, "8">; +def VLD3q16_UPD : VLD3DWB<0b0101, 0b0100, "16">; +def VLD3q32_UPD : VLD3DWB<0b0101, 0b1000, "32">; + +// ...alternate versions to be allocated odd register numbers: +def VLD3q8odd_UPD : VLD3DWB<0b0101, 0b0000, "8">; +def VLD3q16odd_UPD : VLD3DWB<0b0101, 0b0100, "16">; +def VLD3q32odd_UPD : VLD3DWB<0b0101, 0b1000, "32">; + +// VLD4 : Vector Load (multiple 4-element structures) +class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, + (outs 
DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD4, + "vld4", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>; + +def VLD4d8 : VLD4D<0b0000, 0b0000, "8">; +def VLD4d16 : VLD4D<0b0000, 0b0100, "16">; +def VLD4d32 : VLD4D<0b0000, 0b1000, "32">; + +// ...with address register writeback: +class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD4, + "vld4", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VLD4d8_UPD : VLD4DWB<0b0000, 0b0000, "8">; +def VLD4d16_UPD : VLD4DWB<0b0000, 0b0100, "16">; +def VLD4d32_UPD : VLD4DWB<0b0000, 0b1000, "32">; + +// ...with double-spaced registers (non-updating versions for disassembly only): +def VLD4q8 : VLD4D<0b0001, 0b0000, "8">; +def VLD4q16 : VLD4D<0b0001, 0b0100, "16">; +def VLD4q32 : VLD4D<0b0001, 0b1000, "32">; +def VLD4q8_UPD : VLD4DWB<0b0001, 0b0000, "8">; +def VLD4q16_UPD : VLD4DWB<0b0001, 0b0100, "16">; +def VLD4q32_UPD : VLD4DWB<0b0001, 0b1000, "32">; + +// ...alternate versions to be allocated odd register numbers: +def VLD4q8odd_UPD : VLD4DWB<0b0001, 0b0000, "8">; +def VLD4q16odd_UPD : VLD4DWB<0b0001, 0b0100, "16">; +def VLD4q32odd_UPD : VLD4DWB<0b0001, 0b1000, "32">; + +// VLD1LN : Vector Load (single element to one lane) +// FIXME: Not yet implemented. + +// VLD2LN : Vector Load (single 2-element structure to one lane) +class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), + IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr", + "$src1 = $dst1, $src2 = $dst2", []>; + +def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8">; +def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16">; +def VLD2LNd32 : VLD2LN<0b1001, {?,0,?,?}, "32">; + +// ...with double-spaced registers: +def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32 : VLD2LN<0b1001, {?,1,?,?}, "32">; + +// ...alternate versions to be allocated odd register numbers: +def VLD2LNq16odd : VLD2LN<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32odd : VLD2LN<0b1001, {?,1,?,?}, "32">; + +// ...with address register writeback: +class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt, + "\\{$dst1[$lane], $dst2[$lane]\\}, $addr$offset", + "$src1 = $dst1, $src2 = $dst2, $addr.addr = $wb", []>; + +def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8">; +def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16">; +def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,?,?}, "32">; + +def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,?,?}, "32">; + +// VLD3LN : Vector Load (single 3-element structure to one lane) +class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VLD3, "vld3", Dt, + "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr", + "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>; + +def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8">; +def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16">; +def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32">; + +// ...with double-spaced registers: +def VLD3LNq16 : 
VLD3LN<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32">; + +// ...alternate versions to be allocated odd register numbers: +def VLD3LNq16odd : VLD3LN<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32odd : VLD3LN<0b1010, {?,1,0,0}, "32">; + +// ...with address register writeback: +class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), + IIC_VLD3, "vld3", Dt, + "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr$offset", + "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $addr.addr = $wb", + []>; + +def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8">; +def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16">; +def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32">; + +def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32">; + +// VLD4LN : Vector Load (single 4-element structure to one lane) +class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VLD4, "vld4", Dt, + "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr", + "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>; + +def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8">; +def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16">; +def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32">; + +// ...with double-spaced registers: +def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32">; + +// ...alternate versions to be allocated odd register numbers: +def VLD4LNq16odd : VLD4LN<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32odd : VLD4LN<0b1011, {?,1,?,?}, "32">; + +// ...with address register writeback: +class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), + IIC_VLD4, "vld4", Dt, +"\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr$offset", +"$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $addr.addr = $wb", + []>; + +def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8">; +def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16">; +def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32">; + +def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">; + +// VLD1DUP : Vector Load (single element to all lanes) +// VLD2DUP : Vector Load (single 2-element structure to all lanes) +// VLD3DUP : Vector Load (single 3-element structure to all lanes) +// VLD4DUP : Vector Load (single 4-element structure to all lanes) +// FIXME: Not yet implemented. 
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 + +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { + +// VST1 : Vector Store (multiple single elements) +class VST1D<bits<4> op7_4, string Dt> + : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST, + "vst1", Dt, "\\{$src\\}, $addr", "", []>; +class VST1Q<bits<4> op7_4, string Dt> + : NLdSt<0,0b00,0b1010,op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, + "vst1", Dt, "\\{$src1, $src2\\}, $addr", "", []>; + +def VST1d8 : VST1D<0b0000, "8">; +def VST1d16 : VST1D<0b0100, "16">; +def VST1d32 : VST1D<0b1000, "32">; +def VST1d64 : VST1D<0b1100, "64">; + +def VST1q8 : VST1Q<0b0000, "8">; +def VST1q16 : VST1Q<0b0100, "16">; +def VST1q32 : VST1Q<0b1000, "32">; +def VST1q64 : VST1Q<0b1100, "64">; + +// ...with address register writeback: +class VST1DWB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0111, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, DPR:$src), IIC_VST, + "vst1", Dt, "\\{$src\\}, $addr$offset", "$addr.addr = $wb", []>; +class VST1QWB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b1010, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src), IIC_VST, + "vst1", Dt, "${src:dregpair}, $addr$offset", "$addr.addr = $wb", []>; + +def VST1d8_UPD : VST1DWB<0b0000, "8">; +def VST1d16_UPD : VST1DWB<0b0100, "16">; +def VST1d32_UPD : VST1DWB<0b1000, "32">; +def VST1d64_UPD : VST1DWB<0b1100, "64">; + +def VST1q8_UPD : VST1QWB<0b0000, "8">; +def VST1q16_UPD : VST1QWB<0b0100, "16">; +def VST1q32_UPD : VST1QWB<0b1000, "32">; +def VST1q64_UPD : VST1QWB<0b1100, "64">; + +// ...with 3 registers (some of these are only for the disassembler): +class VST1D3<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0110, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>; +class VST1D3WB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, DPR:$src3), + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VST1d8T : VST1D3<0b0000, "8">; +def VST1d16T : VST1D3<0b0100, "16">; +def VST1d32T : VST1D3<0b1000, "32">; +def VST1d64T : VST1D3<0b1100, "64">; + +def VST1d8T_UPD : VST1D3WB<0b0000, "8">; +def VST1d16T_UPD : VST1D3WB<0b0100, "16">; +def VST1d32T_UPD : VST1D3WB<0b1000, "32">; +def VST1d64T_UPD : VST1D3WB<0b1100, "64">; + +// ...with 4 registers (some of these are only for the disassembler): +class VST1D4<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0010, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", + []>; +class VST1D4WB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VST1d8Q : VST1D4<0b0000, "8">; +def VST1d16Q : VST1D4<0b0100, "16">; +def VST1d32Q : VST1D4<0b1000, "32">; +def VST1d64Q : VST1D4<0b1100, "64">; + +def VST1d8Q_UPD : VST1D4WB<0b0000, "8">; +def VST1d16Q_UPD : VST1D4WB<0b0100, "16">; +def VST1d32Q_UPD : VST1D4WB<0b1000, "32">; +def VST1d64Q_UPD : VST1D4WB<0b1100, "64">; + +// VST2 : Vector Store (multiple 2-element structures) +class VST2D<bits<4> op11_8, bits<4> op7_4, 
string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2), + IIC_VST, "vst2", Dt, "\\{$src1, $src2\\}, $addr", "", []>; +class VST2Q<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0011, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, "vst2", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", + "", []>; + +def VST2d8 : VST2D<0b1000, 0b0000, "8">; +def VST2d16 : VST2D<0b1000, 0b0100, "16">; +def VST2d32 : VST2D<0b1000, 0b1000, "32">; + +def VST2q8 : VST2Q<0b0000, "8">; +def VST2q16 : VST2Q<0b0100, "16">; +def VST2q32 : VST2Q<0b1000, "32">; + +// ...with address register writeback: +class VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2), + IIC_VST, "vst2", Dt, "\\{$src1, $src2\\}, $addr$offset", + "$addr.addr = $wb", []>; +class VST2QWB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, "vst2", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VST2d8_UPD : VST2DWB<0b1000, 0b0000, "8">; +def VST2d16_UPD : VST2DWB<0b1000, 0b0100, "16">; +def VST2d32_UPD : VST2DWB<0b1000, 0b1000, "32">; + +def VST2q8_UPD : VST2QWB<0b0000, "8">; +def VST2q16_UPD : VST2QWB<0b0100, "16">; +def VST2q32_UPD : VST2QWB<0b1000, "32">; + +// ...with double-spaced registers (for disassembly only): +def VST2b8 : VST2D<0b1001, 0b0000, "8">; +def VST2b16 : VST2D<0b1001, 0b0100, "16">; +def VST2b32 : VST2D<0b1001, 0b1000, "32">; +def VST2b8_UPD : VST2DWB<0b1001, 0b0000, "8">; +def VST2b16_UPD : VST2DWB<0b1001, 0b0100, "16">; +def VST2b32_UPD : VST2DWB<0b1001, 0b1000, "32">; + +// VST3 : Vector Store (multiple 3-element structures) +class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, + "vst3", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>; + +def VST3d8 : VST3D<0b0100, 0b0000, "8">; +def VST3d16 : VST3D<0b0100, 0b0100, "16">; +def VST3d32 : VST3D<0b0100, 0b1000, "32">; + +// ...with address register writeback: +class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, + "vst3", Dt, "\\{$src1, $src2, $src3\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VST3d8_UPD : VST3DWB<0b0100, 0b0000, "8">; +def VST3d16_UPD : VST3DWB<0b0100, 0b0100, "16">; +def VST3d32_UPD : VST3DWB<0b0100, 0b1000, "32">; + +// ...with double-spaced registers (non-updating versions for disassembly only): +def VST3q8 : VST3D<0b0101, 0b0000, "8">; +def VST3q16 : VST3D<0b0101, 0b0100, "16">; +def VST3q32 : VST3D<0b0101, 0b1000, "32">; +def VST3q8_UPD : VST3DWB<0b0101, 0b0000, "8">; +def VST3q16_UPD : VST3DWB<0b0101, 0b0100, "16">; +def VST3q32_UPD : VST3DWB<0b0101, 0b1000, "32">; + +// ...alternate versions to be allocated odd register numbers: +def VST3q8odd_UPD : VST3DWB<0b0101, 0b0000, "8">; +def VST3q16odd_UPD : VST3DWB<0b0101, 0b0100, "16">; +def VST3q32odd_UPD : VST3DWB<0b0101, 0b1000, "32">; + +// VST4 : Vector Store (multiple 4-element structures) +class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, "vst4", Dt, "\\{$src1, 
$src2, $src3, $src4\\}, $addr", + "", []>; + +def VST4d8 : VST4D<0b0000, 0b0000, "8">; +def VST4d16 : VST4D<0b0000, 0b0100, "16">; +def VST4d32 : VST4D<0b0000, 0b1000, "32">; + +// ...with address register writeback: +class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST, + "vst4", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VST4d8_UPD : VST4DWB<0b0000, 0b0000, "8">; +def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">; +def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">; + +// ...with double-spaced registers (non-updating versions for disassembly only): +def VST4q8 : VST4D<0b0001, 0b0000, "8">; +def VST4q16 : VST4D<0b0001, 0b0100, "16">; +def VST4q32 : VST4D<0b0001, 0b1000, "32">; +def VST4q8_UPD : VST4DWB<0b0001, 0b0000, "8">; +def VST4q16_UPD : VST4DWB<0b0001, 0b0100, "16">; +def VST4q32_UPD : VST4DWB<0b0001, 0b1000, "32">; + +// ...alternate versions to be allocated odd register numbers: +def VST4q8odd_UPD : VST4DWB<0b0001, 0b0000, "8">; +def VST4q16odd_UPD : VST4DWB<0b0001, 0b0100, "16">; +def VST4q32odd_UPD : VST4DWB<0b0001, 0b1000, "32">; + +// VST1LN : Vector Store (single element from one lane) +// FIXME: Not yet implemented. + +// VST2LN : Vector Store (single 2-element structure from one lane) +class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), + IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr", + "", []>; + +def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8">; +def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16">; +def VST2LNd32 : VST2LN<0b1001, {?,0,?,?}, "32">; + +// ...with double-spaced registers: +def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16">; +def VST2LNq32 : VST2LN<0b1001, {?,1,?,?}, "32">; + +// ...alternate versions to be allocated odd register numbers: +def VST2LNq16odd : VST2LN<0b0101, {?,?,1,?}, "16">; +def VST2LNq32odd : VST2LN<0b1001, {?,1,?,?}, "32">; + +// ...with address register writeback: +class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt, + "\\{$src1[$lane], $src2[$lane]\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8">; +def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16">; +def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,?,?}, "32">; + +def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16">; +def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,?,?}, "32">; + +// VST3LN : Vector Store (single 3-element structure from one lane) +class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VST, "vst3", Dt, + "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr", "", []>; + +def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8">; +def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16">; +def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32">; + +// ...with double-spaced registers: +def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16">; +def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32">; + +// ...alternate versions to be allocated odd register numbers: +def VST3LNq16odd : VST3LN<0b0110, {?,?,1,0}, "16">; +def VST3LNq32odd : VST3LN<0b1010, {?,1,0,0}, "32">; + +// ...with address 
register writeback: +class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), + IIC_VST, "vst3", Dt, + "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8">; +def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16">; +def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32">; + +def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16">; +def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32">; + +// VST4LN : Vector Store (single 4-element structure from one lane) +class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VST, "vst4", Dt, + "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr", + "", []>; + +def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8">; +def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16">; +def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32">; + +// ...with double-spaced registers: +def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16">; +def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32">; + +// ...alternate versions to be allocated odd register numbers: +def VST4LNq16odd : VST4LN<0b0111, {?,?,1,?}, "16">; +def VST4LNq32odd : VST4LN<0b1011, {?,1,?,?}, "32">; + +// ...with address register writeback: +class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), + IIC_VST, "vst4", Dt, + "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr$offset", + "$addr.addr = $wb", []>; + +def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8">; +def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16">; +def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32">; + +def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16">; +def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32">; + +} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 + + +//===----------------------------------------------------------------------===// +// NEON pattern fragments +//===----------------------------------------------------------------------===// + +// Extract D sub-registers of Q registers. +def DSubReg_i8_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, MVT::i32); +}]>; +def DSubReg_i16_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, MVT::i32); +}]>; +def DSubReg_i32_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, MVT::i32); +}]>; +def DSubReg_f64_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), MVT::i32); +}]>; +def DSubReg_f64_other_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + (1 - N->getZExtValue()), + MVT::i32); +}]>; + +// Extract S sub-registers of Q/D registers. 
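The division and masking these sub-register transforms encode is easy to check in isolation. A stand-alone C++ sketch (illustrative only, printing an index relative to dsub_0; not an LLVM API):

#include <cstdio>

int main() {
  // A Q register spans two D registers, so for elements of width W bits a
  // Q-register lane L maps to D sub-register L / (64 / W) and to lane
  // L % (64 / W) within it, matching the DSubReg_i*_reg transforms above
  // and the SubReg_i*_lane transforms that follow.
  unsigned Lane = 5, ElemBits = 16;   // lane 5 of a v8i16 vector
  unsigned PerD = 64 / ElemBits;      // 4 elements per D register
  std::printf("dsub_%u, lane %u\n", Lane / PerD, Lane % PerD); // dsub_1, lane 1
  return 0;
}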
+def SSubReg_f32_reg : SDNodeXForm<imm, [{ + assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), MVT::i32); +}]>; + +// Translate lane numbers from Q registers to D subregs. +def SubReg_i8_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 7, MVT::i32); +}]>; +def SubReg_i16_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 3, MVT::i32); +}]>; +def SubReg_i32_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 1, MVT::i32); +}]>; + +//===----------------------------------------------------------------------===// +// Instruction Classes +//===----------------------------------------------------------------------===// + +// Basic 2-register operations: single-, double- and quad-register. +class N2VS<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), + IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "", []>; +class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), + (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt,"$dst, $src", "", + [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>; +class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), + (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt,"$dst, $src", "", + [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>; + +// Basic 2-register intrinsics, both double- and quad-register. +class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), + (ins DPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", + [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>; +class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), + (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", + [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; + +// Narrow 2-register intrinsics. +class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyD, ValueType TyQ, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst), + (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", + [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>; + +// Long 2-register intrinsics (currently only used for VMOVL). 
+class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$dst), + (ins DPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", + [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>; + +// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register. +class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt> + : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$dst1, DPR:$dst2), + (ins DPR:$src1, DPR:$src2), IIC_VPERMD, + OpcodeStr, Dt, "$dst1, $dst2", + "$src1 = $dst1, $src2 = $dst2", []>; +class N2VQShuffle<bits<2> op19_18, bits<5> op11_7, + InstrItinClass itin, string OpcodeStr, string Dt> + : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$dst1, QPR:$dst2), + (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$dst1, $dst2", + "$src1 = $dst1, $src2 = $dst2", []>; + +// Basic 3-register operations: single-, double- and quad-register. +class N3VS<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, + IIC_VBIND, OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { + let isCommutable = Commutable; +} + +class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src1, $src2", "", + [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { + let isCommutable = Commutable; +} +// Same as N3VD but no data type. 
+class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, + SDNode OpNode, bit Commutable> + : N3VX<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, + OpcodeStr, "$dst, $src1, $src2", "", + [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]>{ + let isCommutable = Commutable; +} + +class N3VDSL<bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + [(set (Ty DPR:$dst), + (Ty (ShOp (Ty DPR:$src1), + (Ty (NEONvduplane (Ty DPR_VFP2:$src2),imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VDSL16<bits<2> op21_20, bits<4> op11_8, + string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), + NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","", + [(set (Ty DPR:$dst), + (Ty (ShOp (Ty DPR:$src1), + (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { + let isCommutable = 0; +} + +class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src1, $src2", "", + [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { + let isCommutable = Commutable; +} +class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> + : N3VX<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, + OpcodeStr, "$dst, $src1, $src2", "", + [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]>{ + let isCommutable = Commutable; +} +class N3VQSL<bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode ShOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + [(set (ResTy QPR:$dst), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode ShOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), + NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","", + [(set (ResTy QPR:$dst), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (NEONvduplane (OpTy DPR_8:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} + +// Basic 3-register intrinsics, both double- and quad-register. 
+class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), f, itin, + OpcodeStr, Dt, "$dst, $src1, $src2", "", + [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { + let isCommutable = Commutable; +} +class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + [(set (Ty DPR:$dst), + (Ty (IntOp (Ty DPR:$src1), + (Ty (NEONvduplane (Ty DPR_VFP2:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + [(set (Ty DPR:$dst), + (Ty (IntOp (Ty DPR:$src1), + (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { + let isCommutable = 0; +} + +class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), f, itin, + OpcodeStr, Dt, "$dst, $src1, $src2", "", + [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { + let isCommutable = Commutable; +} +class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (ResTy QPR:$src1), + (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (ResTy QPR:$src1), + (ResTy (NEONvduplane (OpTy DPR_8:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} + +// Multiply-Add/Sub operations: single-, double- and quad-register. 
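The N3V*Int classes above differ from the plain N3V* classes in that their patterns match a target intrinsic (the IntOp parameter, an int_arm_neon_* record) rather than a generic SDNode, which is how operations with no generic DAG form (saturating, halving, rounding arithmetic) get selected. From C these are normally reached through arm_neon.h. A hedged sketch; the intrinsic-to-instruction mapping in the comment is the expected one for records such as VQADDs defined later in this file:

  #include <arm_neon.h>

  // Saturating add of four signed halfwords; expected to become an
  // llvm.arm.neon.vqadds call in IR and then vqadd.s16 d0, d0, d1.
  int16x4_t sat_add(int16x4_t a, int16x4_t b) {
    return vqadd_s16(a, b);
  }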
+class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDNode MulOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR_VFP2:$dst), + (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", []>; + +class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDNode MulOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", + [(set DPR:$dst, (Ty (OpNode DPR:$src1, + (Ty (MulOp DPR:$src2, DPR:$src3)))))]>; +class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType Ty, SDNode MulOp, SDNode ShOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", + [(set (Ty DPR:$dst), + (Ty (ShOp (Ty DPR:$src1), + (Ty (MulOp DPR:$src2, + (Ty (NEONvduplane (Ty DPR_VFP2:$src3), + imm:$lane)))))))]>; +class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType Ty, SDNode MulOp, SDNode ShOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", + [(set (Ty DPR:$dst), + (Ty (ShOp (Ty DPR:$src1), + (Ty (MulOp DPR:$src2, + (Ty (NEONvduplane (Ty DPR_8:$src3), + imm:$lane)))))))]>; + +class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, + SDNode MulOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, (Ty (OpNode QPR:$src1, + (Ty (MulOp QPR:$src2, QPR:$src3)))))]>; +class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + SDNode MulOp, SDNode ShOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", + [(set (ResTy QPR:$dst), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (MulOp QPR:$src2, + (ResTy (NEONvduplane (OpTy DPR_VFP2:$src3), + imm:$lane)))))))]>; +class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, + SDNode MulOp, SDNode ShOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", + [(set (ResTy QPR:$dst), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (MulOp QPR:$src2, + (ResTy (NEONvduplane (OpTy DPR_8:$src3), + imm:$lane)))))))]>; + +// Neon 3-argument intrinsics, both double- and quad-register. +// The destination register is also used as the first source operand register. 
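In the multiply-accumulate classes above, and again in the 3-argument intrinsic classes that follow, the "$src1 = $dst" constraint ties the accumulator input to the destination register, which is how the read-modify-write accumulate semantics are described to the register allocator. A small C sketch, assuming arm_neon.h; the lowering in the comment is the expected one, not a claim made by this patch:

  #include <arm_neon.h>

  // acc + a*b on two single-precision lanes. The accumulator and result share
  // a register, matching "$src1 = $dst"; expected selection is roughly
  //   vmla.f32 d0, d1, d2     (d0 carries acc in and the result out)
  float32x2_t mac(float32x2_t acc, float32x2_t a, float32x2_t b) {
    return vmla_f32(acc, a, b);
  }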
+class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", + [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), + (OpTy DPR:$src2), (OpTy DPR:$src3))))]>; +class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), + (OpTy QPR:$src2), (OpTy QPR:$src3))))]>; + +// Neon Long 3-argument intrinsic. The destination register is +// a quad-register and is also used as the first source operand register. +class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, + (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>; +class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), + (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (ResTy QPR:$src1), + (OpTy DPR:$src2), + (OpTy (NEONvduplane (OpTy DPR_VFP2:$src3), + imm:$lane)))))]>; +class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), + (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (ResTy QPR:$src1), + (OpTy DPR:$src2), + (OpTy (NEONvduplane (OpTy DPR_8:$src3), + imm:$lane)))))]>; + +// Narrowing 3-register intrinsics. +class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, + Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINi4D, + OpcodeStr, Dt, "$dst, $src1, $src2", "", + [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> { + let isCommutable = Commutable; +} + +// Long 3-register intrinsics. 
+class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src1, $src2", "", + [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> { + let isCommutable = Commutable; +} +class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (OpTy DPR:$src1), + (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2), + imm:$lane)))))]>; +class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (OpTy DPR:$src1), + (OpTy (NEONvduplane (OpTy DPR_8:$src2), + imm:$lane)))))]>; + +// Wide 3-register intrinsics. +class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, + Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), N3RegFrm, IIC_VSUBiD, + OpcodeStr, Dt, "$dst, $src1, $src2", "", + [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> { + let isCommutable = Commutable; +} + +// Pairwise long 2-register intrinsics, both double- and quad-register. +class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), + (ins DPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", + [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>; +class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), + (ins QPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", + [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; + +// Pairwise long 2-register accumulate intrinsics, +// both double- and quad-register. +// The destination register is also used as the first source operand register. 
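The classes above cover the three operand shapes that recur through the rest of the file: long (Q = op(D, D), result elements twice the source width), wide (Q = op(Q, D), only the second source is narrow) and narrow (D = op(Q, Q), result elements half the source width). A hedged C sketch of the three shapes, assuming arm_neon.h, with the expected mnemonics in comments:

  #include <arm_neon.h>

  int32x4_t widen_add(int16x4_t a, int16x4_t b) {
    return vaddl_s16(a, b);       // long:   expected vaddl.s16 q0, d0, d1
  }
  int32x4_t wide_add(int32x4_t a, int16x4_t b) {
    return vaddw_s16(a, b);       // wide:   expected vaddw.s16 q0, q0, d2
  }
  int16x4_t narrow_add(int32x4_t a, int32x4_t b) {
    return vaddhn_s32(a, b);      // narrow: expected vaddhn.i32 d0, q0, q1
  }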
+class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), IIC_VPALiD, + OpcodeStr, Dt, "$dst, $src2", "$src1 = $dst", + [(set DPR:$dst, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$src2))))]>; +class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VPALiQ, + OpcodeStr, Dt, "$dst, $src2", "$src1 = $dst", + [(set QPR:$dst, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$src2))))]>; + +// Shift by immediate, +// both double- and quad-register. +class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDNode OpNode> + : N2VImm<op24, op23, op11_8, op7, 0, op4, + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), f, itin, + OpcodeStr, Dt, "$dst, $src, $SIMM", "", + [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>; +class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDNode OpNode> + : N2VImm<op24, op23, op11_8, op7, 1, op4, + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), f, itin, + OpcodeStr, Dt, "$dst, $src, $SIMM", "", + [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>; + +// Long shift by immediate. +class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2VImm<op24, op23, op11_8, op7, op6, op4, + (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), N2RegVShLFrm, + IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src, $SIMM", "", + [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src), + (i32 imm:$SIMM))))]>; + +// Narrow shift by immediate. +class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2VImm<op24, op23, op11_8, op7, op6, op4, + (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), N2RegVShRFrm, itin, + OpcodeStr, Dt, "$dst, $src, $SIMM", "", + [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src), + (i32 imm:$SIMM))))]>; + +// Shift right by immediate and accumulate, +// both double- and quad-register. +class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, + OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", + [(set DPR:$dst, (Ty (add DPR:$src1, + (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>; +class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, + OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", + [(set QPR:$dst, (Ty (add QPR:$src1, + (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>; + +// Shift by immediate and insert, +// both double- and quad-register. 
+class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), f, IIC_VSHLiD, + OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", + [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>; +class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), f, IIC_VSHLiQ, + OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", + [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>; + +// Convert, with fractional bits immediate, +// both double- and quad-register. +class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp> + : N2VImm<op24, op23, op11_8, op7, 0, op4, + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), NVCVTFrm, + IIC_VUNAD, OpcodeStr, Dt, "$dst, $src, $SIMM", "", + [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>; +class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp> + : N2VImm<op24, op23, op11_8, op7, 1, op4, + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), NVCVTFrm, + IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src, $SIMM", "", + [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>; + +//===----------------------------------------------------------------------===// +// Multiclasses +//===----------------------------------------------------------------------===// + +// Abbreviations used in multiclass suffixes: +// Q = quarter int (8 bit) elements +// H = half int (16 bit) elements +// S = single int (32 bit) elements +// D = double int (64 bit) elements + +// Neon 2-register vector operations -- for disassembly only. + +// First with only element sizes of 8, 16 and 32 bits: +multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, string opc, string Dt, + string asm> { + // 64-bit vector types. + def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4, + (outs DPR:$dst), (ins DPR:$src), NoItinerary, + opc, !strconcat(Dt, "8"), asm, "", []>; + def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, + (outs DPR:$dst), (ins DPR:$src), NoItinerary, + opc, !strconcat(Dt, "16"), asm, "", []>; + def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, + (outs DPR:$dst), (ins DPR:$src), NoItinerary, + opc, !strconcat(Dt, "32"), asm, "", []>; + def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, + (outs DPR:$dst), (ins DPR:$src), NoItinerary, + opc, "f32", asm, "", []> { + let Inst{10} = 1; // overwrite F = 1 + } + + // 128-bit vector types. 
+ def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4, + (outs QPR:$dst), (ins QPR:$src), NoItinerary, + opc, !strconcat(Dt, "8"), asm, "", []>; + def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, + (outs QPR:$dst), (ins QPR:$src), NoItinerary, + opc, !strconcat(Dt, "16"), asm, "", []>; + def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, + (outs QPR:$dst), (ins QPR:$src), NoItinerary, + opc, !strconcat(Dt, "32"), asm, "", []>; + def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, + (outs QPR:$dst), (ins QPR:$src), NoItinerary, + opc, "f32", asm, "", []> { + let Inst{10} = 1; // overwrite F = 1 + } +} + +// Neon 3-register vector operations. + +// First with only element sizes of 8, 16 and 32 bits: +multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + SDNode OpNode, bit Commutable = 0> { + // 64-bit vector types. + def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, OpNode, Commutable>; + def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, OpNode, Commutable>; + def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32, + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, OpNode, Commutable>; + + // 128-bit vector types. + def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, OpNode, Commutable>; + def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, OpNode, Commutable>; + def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, OpNode, Commutable>; +} + +multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, string Dt, SDNode ShOp> { + def v4i16 : N3VDSL16<0b01, op11_8, OpcodeStr, !strconcat(Dt, "16"), + v4i16, ShOp>; + def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, OpcodeStr, !strconcat(Dt,"32"), + v2i32, ShOp>; + def v8i16 : N3VQSL16<0b01, op11_8, OpcodeStr, !strconcat(Dt, "16"), + v8i16, v4i16, ShOp>; + def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, OpcodeStr, !strconcat(Dt,"32"), + v4i32, v2i32, ShOp>; +} + +// ....then also with element size 64 bits: +multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, + SDNode OpNode, bit Commutable = 0> + : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ, + OpcodeStr, Dt, OpNode, Commutable> { + def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "64"), + v1i64, v1i64, OpNode, Commutable>; + def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "64"), + v2i64, v2i64, OpNode, Commutable>; +} + + +// Neon Narrowing 2-register vector intrinsics, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp> { + def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, IntOp>; + def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, IntOp>; + def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, + itin, OpcodeStr, 
!strconcat(Dt, "64"), + v2i32, v2i64, IntOp>; +} + + +// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL). +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VLInt_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4, + string OpcodeStr, string Dt, Intrinsic IntOp> { + def v8i16 : N2VLInt<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>; + def v4i32 : N2VLInt<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; + def v2i64 : N2VLInt<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +} + + +// Neon 3-register vector intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> { + // 64-bit vector types. + def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, IntOp, Commutable>; + def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, IntOp, Commutable>; + + // 128-bit vector types. + def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, IntOp, Commutable>; + def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, IntOp, Commutable>; +} + +multiclass N3VIntSL_HS<bits<4> op11_8, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, Intrinsic IntOp> { + def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>; + def v2i32 : N3VDIntSL<0b10, op11_8, itinD32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp>; + def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, IntOp>; + def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, IntOp>; +} + +// ....then also with element size of 8 bits: +multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> + : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp, Commutable> { + def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, IntOp, Commutable>; + def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, IntOp, Commutable>; +} + +// ....then also with element size of 64 bits: +multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> + : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp, Commutable> { + def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "64"), + v1i64, v1i64, IntOp, Commutable>; + def v2i64 : N3VQInt<op24, op23, 0b11, 
op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "64"), + v2i64, v2i64, IntOp, Commutable>; +} + +// Neon Narrowing 3-register vector intrinsics, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> { + def v8i8 : N3VNInt<op24, op23, 0b00, op11_8, op4, + OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, IntOp, Commutable>; + def v4i16 : N3VNInt<op24, op23, 0b01, op11_8, op4, + OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, IntOp, Commutable>; + def v2i32 : N3VNInt<op24, op23, 0b10, op11_8, op4, + OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, IntOp, Commutable>; +} + + +// Neon Long 3-register vector intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> { + def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, IntOp, Commutable>; + def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, IntOp, Commutable>; +} + +multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp> { + def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; + def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +} + +// ....then also with element size of 8 bits: +multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> + : N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, + IntOp, Commutable> { + def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, IntOp, Commutable>; +} + + +// Neon Wide 3-register vector intrinsics, +// source operand element sizes of 8, 16 and 32 bits: +multiclass N3VWInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> { + def v8i16 : N3VWInt<op24, op23, 0b00, op11_8, op4, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, IntOp, Commutable>; + def v4i32 : N3VWInt<op24, op23, 0b01, op11_8, op4, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, IntOp, Commutable>; + def v2i64 : N3VWInt<op24, op23, 0b10, op11_8, op4, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, IntOp, Commutable>; +} + + +// Neon Multiply-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "8"), v8i8, mul, OpNode>; + def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, OpNode>; + def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, OpNode>; + + // 128-bit vector types. 
+ def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), v16i8, mul, OpNode>; + def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, mul, OpNode>; + def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, mul, OpNode>; +} + +multiclass N3VMulOpSL_HS<bits<4> op11_8, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDNode ShOp> { + def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, ShOp>; + def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, ShOp>; + def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, + mul, ShOp>; + def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, + mul, ShOp>; +} + +// Neon 3-argument intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; + def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>; + def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>; + def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>; + def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>; +} + + +// Neon Long 3-argument intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, Intrinsic IntOp> { + def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; + def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +} + +multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8, + string OpcodeStr, string Dt, Intrinsic IntOp> { + def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D, + OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>; + def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +} + +// ....then also with element size of 8 bits: +multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, Intrinsic IntOp> + : N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> { + def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>; +} + + +// Neon 2-register vector intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, Intrinsic IntOp> { + // 64-bit vector types. 
+ def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; + def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>; + def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>; + def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>; + def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>; +} + + +// Neon Pairwise long 2-register intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>; + def v4i16 : N2VDPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>; + def v2i32 : N2VDPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>; + def v8i16 : N2VQPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>; + def v4i32 : N2VQPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>; +} + + +// Neon Pairwise long 2-register accumulate intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>; + def v4i16 : N2VDPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>; + def v2i32 : N2VDPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>; + def v8i16 : N2VQPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>; + def v4i32 : N2VQPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>; +} + + +// Neon 2-register vector shift by immediate, +// with f of either N2RegVShLFrm or N2RegVShRFrm +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode, Format f> { + // 64-bit vector types. 
+ def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, + OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, + OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, + OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, f, itin, + OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, + OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, + OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, + OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, f, itin, + OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; + // imm6 = xxxxxx +} + +// Neon Shift-Accumulate vector operations, +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, SDNode ShOp> { + // 64-bit vector types. + def v8i8 : N2VDShAdd<op24, op23, op11_8, 0, op4, + OpcodeStr, !strconcat(Dt, "8"), v8i8, ShOp> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, + OpcodeStr, !strconcat(Dt, "16"), v4i16, ShOp> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, + OpcodeStr, !strconcat(Dt, "32"), v2i32, ShOp> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, + OpcodeStr, !strconcat(Dt, "64"), v1i64, ShOp>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, + OpcodeStr, !strconcat(Dt, "8"), v16i8, ShOp> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, + OpcodeStr, !strconcat(Dt, "16"), v8i16, ShOp> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, + OpcodeStr, !strconcat(Dt, "32"), v4i32, ShOp> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, + OpcodeStr, !strconcat(Dt, "64"), v2i64, ShOp>; + // imm6 = xxxxxx +} + + +// Neon Shift-Insert vector operations, +// with f of either N2RegVShLFrm or N2RegVShRFrm +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, SDNode ShOp, + Format f> { + // 64-bit vector types. + def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, + f, OpcodeStr, "8", v8i8, ShOp> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, + f, OpcodeStr, "16", v4i16, ShOp> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, + f, OpcodeStr, "32", v2i32, ShOp> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, + f, OpcodeStr, "64", v1i64, ShOp>; + // imm6 = xxxxxx + + // 128-bit vector types. 
+ def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, + f, OpcodeStr, "8", v16i8, ShOp> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, + f, OpcodeStr, "16", v8i16, ShOp> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, + f, OpcodeStr, "32", v4i32, ShOp> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, + f, OpcodeStr, "64", v2i64, ShOp>; + // imm6 = xxxxxx +} + +// Neon Shift Long operations, +// element sizes of 8, 16, 32 bits: +multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, + bit op4, string OpcodeStr, string Dt, SDNode OpNode> { + def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } +} + +// Neon Shift Narrow operations, +// element sizes of 16, 32, 64 bits: +multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, + bit op4, InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, + OpcodeStr, !strconcat(Dt, "16"), v8i8, v8i16, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, + OpcodeStr, !strconcat(Dt, "32"), v4i16, v4i32, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, + OpcodeStr, !strconcat(Dt, "64"), v2i32, v2i64, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } +} + +//===----------------------------------------------------------------------===// +// Instruction Definitions. +//===----------------------------------------------------------------------===// + +// Vector Add Operations. 
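The definitions below instantiate the multiclasses: each defm expands into one record per element type, named by appending the multiclass suffix to the defm name (the patterns later in this section refer to such records, e.g. VMULslv8i16 and VQDMULHslv4i32), and each record appends its element size to the Dt string, so a single defm like VADD covers vadd.i8 through vadd.i64 in both D- and Q-register forms. A small C sketch showing two element widths of the same source-level operation, assuming arm_neon.h:

  #include <arm_neon.h>

  int8x8_t  add8 (int8x8_t a,  int8x8_t b)  { return vadd_s8(a, b);   } // expected vadd.i8  d0, d0, d1
  int32x4_t add32(int32x4_t a, int32x4_t b) { return vaddq_s32(a, b); } // expected vadd.i32 q0, q0, q1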
+ +// VADD : Vector Add (integer and floating-point) +defm VADD : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd", "i", + add, 1>; +def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32", + v2f32, v2f32, fadd, 1>; +def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32", + v4f32, v4f32, fadd, 1>; +// VADDL : Vector Add Long (Q = D + D) +defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, + "vaddl", "s", int_arm_neon_vaddls, 1>; +defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, + "vaddl", "u", int_arm_neon_vaddlu, 1>; +// VADDW : Vector Add Wide (Q = Q + D) +defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw", "s", int_arm_neon_vaddws, 0>; +defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw", "u", int_arm_neon_vaddwu, 0>; +// VHADD : Vector Halving Add +defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "s", int_arm_neon_vhadds, 1>; +defm VHADDu : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "u", int_arm_neon_vhaddu, 1>; +// VRHADD : Vector Rounding Halving Add +defm VRHADDs : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "s", int_arm_neon_vrhadds, 1>; +defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "u", int_arm_neon_vrhaddu, 1>; +// VQADD : Vector Saturating Add +defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "s", int_arm_neon_vqadds, 1>; +defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "u", int_arm_neon_vqaddu, 1>; +// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) +defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", + int_arm_neon_vaddhn, 1>; +// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q) +defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i", + int_arm_neon_vraddhn, 1>; + +// Vector Multiply Operations. 
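The Pat definitions that follow the VMUL, VQDMULH and VQRDMULH definitions handle lane operands that come from a Q register: the operand is rewritten as an EXTRACT_SUBREG of the right D subregister together with an adjusted lane index (the DSubReg_*/SubReg_* transforms, presumably defined earlier in this file, split the index, so for v8i16 lane 5 would become lane 1 of the second D subregister). When the scalar already lives in a D-sized vector the lane form is reachable directly from C; a hedged sketch assuming arm_neon.h:

  #include <arm_neon.h>

  // Multiply a Q register by lane 2 of a D register; expected to select a
  // VMULslv8i16-style record, roughly: vmul.i16 q0, q0, d2[2]
  int16x8_t scale_q_by_lane(int16x8_t a, int16x4_t v) {
    return vmulq_lane_s16(a, v, 2);
  }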
+ +// VMUL : Vector Multiply (integer, polynomial and floating-point) +defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>; +def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul", + "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>; +def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul", + "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>; +def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul", "f32", + v2f32, v2f32, fmul, 1>; +def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul", "f32", + v4f32, v4f32, fmul, 1>; +defm VMULsl : N3VSL_HS<0b1000, "vmul", "i", mul>; +def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>; +def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32, + v2f32, fmul>; + +def : Pat<(v8i16 (mul (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), + (v8i16 (VMULslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (mul (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), + (v4i32 (VMULslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; +def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), + (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))), + (v4f32 (VMULslfq (v4f32 QPR:$src1), + (v2f32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +// VQDMULH : Vector Saturating Doubling Multiply Returning High Half +defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqdmulh", "s", int_arm_neon_vqdmulh, 1>; +defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqdmulh", "s", int_arm_neon_vqdmulh>; +def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), + imm:$lane)))), + (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), + imm:$lane)))), + (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half +defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm, + IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q, + "vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>; +defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqrdmulh", "s", int_arm_neon_vqrdmulh>; +def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), + imm:$lane)))), + (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), + imm:$lane)))), + (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) +defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "s", int_arm_neon_vmulls, 1>; +defm 
VMULLu : N3VLInt_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "u", int_arm_neon_vmullu, 1>; +def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8", + v8i16, v8i8, int_arm_neon_vmullp, 1>; +defm VMULLsls : N3VLIntSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", + int_arm_neon_vmulls>; +defm VMULLslu : N3VLIntSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", + int_arm_neon_vmullu>; + +// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D) +defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D, + "vqdmull", "s", int_arm_neon_vqdmull, 1>; +defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D, + "vqdmull", "s", int_arm_neon_vqdmull>; + +// Vector Multiply-Accumulate and Multiply-Subtract Operations. + +// VMLA : Vector Multiply Accumulate (integer and floating-point) +defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; +def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", + v2f32, fmul, fadd>; +def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", + v4f32, fmul, fadd>; +defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; +def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", + v2f32, fmul, fadd>; +def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32", + v4f32, v2f32, fmul, fadd>; + +def : Pat<(v8i16 (add (v8i16 QPR:$src1), + (mul (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + +def : Pat<(v4i32 (add (v4i32 QPR:$src1), + (mul (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def : Pat<(v4f32 (fadd (v4f32 QPR:$src1), + (fmul (v4f32 QPR:$src2), + (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (VMLAslfq (v4f32 QPR:$src1), + (v4f32 QPR:$src2), + (v2f32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +// VMLAL : Vector Multiply Accumulate Long (Q += D * D) +defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlal", "s", int_arm_neon_vmlals>; +defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlal", "u", int_arm_neon_vmlalu>; + +defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal", "s", int_arm_neon_vmlals>; +defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal", "u", int_arm_neon_vmlalu>; + +// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) +defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + "vqdmlal", "s", int_arm_neon_vqdmlal>; +defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>; + +// VMLS : Vector Multiply Subtract (integer and floating-point) +defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; +def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", + v2f32, fmul, fsub>; +def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", + v4f32, fmul, fsub>; +defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; +def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, 
IIC_VMACD, "vmls", "f32", + v2f32, fmul, fsub>; +def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32", + v4f32, v2f32, fmul, fsub>; + +def : Pat<(v8i16 (sub (v8i16 QPR:$src1), + (mul (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + +def : Pat<(v4i32 (sub (v4i32 QPR:$src1), + (mul (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def : Pat<(v4f32 (fsub (v4f32 QPR:$src1), + (fmul (v4f32 QPR:$src2), + (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), + (v2f32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +// VMLSL : Vector Multiply Subtract Long (Q -= D * D) +defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlsl", "s", int_arm_neon_vmlsls>; +defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlsl", "u", int_arm_neon_vmlslu>; + +defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl", "s", int_arm_neon_vmlsls>; +defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl", "u", int_arm_neon_vmlslu>; + +// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) +defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, + "vqdmlsl", "s", int_arm_neon_vqdmlsl>; +defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; + +// Vector Subtract Operations. + +// VSUB : Vector Subtract (integer and floating-point) +defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ, + "vsub", "i", sub, 0>; +def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32", + v2f32, v2f32, fsub, 0>; +def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32", + v4f32, v4f32, fsub, 0>; +// VSUBL : Vector Subtract Long (Q = D - D) +defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, + "vsubl", "s", int_arm_neon_vsubls, 1>; +defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, + "vsubl", "u", int_arm_neon_vsublu, 1>; +// VSUBW : Vector Subtract Wide (Q = Q - D) +defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw", "s", int_arm_neon_vsubws, 0>; +defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw", "u", int_arm_neon_vsubwu, 0>; +// VHSUB : Vector Halving Subtract +defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vhsub", "s", int_arm_neon_vhsubs, 0>; +defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vhsub", "u", int_arm_neon_vhsubu, 0>; +// VQSUB : Vector Saturing Subtract +defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vqsub", "s", int_arm_neon_vqsubs, 0>; +defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vqsub", "u", int_arm_neon_vqsubu, 0>; +// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q) +defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", + int_arm_neon_vsubhn, 0>; +// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q) +defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i", + int_arm_neon_vrsubhn, 0>; + +// Vector 
Comparisons. + +// VCEQ : Vector Compare Equal +defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>; +def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, + NEONvceq, 1>; +def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, + NEONvceq, 1>; +// For disassembly only. +defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", + "$dst, $src, #0">; + +// VCGE : Vector Compare Greater Than or Equal +defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>; +defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>; +def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, + NEONvcge, 0>; +def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, + NEONvcge, 0>; +// For disassembly only. +defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", + "$dst, $src, #0">; +// For disassembly only. +defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s", + "$dst, $src, #0">; + +// VCGT : Vector Compare Greater Than +defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>; +defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>; +def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, + NEONvcgt, 0>; +def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, + NEONvcgt, 0>; +// For disassembly only. +defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", + "$dst, $src, #0">; +// For disassembly only. +defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", + "$dst, $src, #0">; + +// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) +def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f32", v2i32, v2f32, int_arm_neon_vacged, 0>; +def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f32", v4i32, v4f32, int_arm_neon_vacgeq, 0>; +// VACGT : Vector Absolute Compare Greater Than (aka VCAGT) +def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", + "f32", v2i32, v2f32, int_arm_neon_vacgtd, 0>; +def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", + "f32", v4i32, v4f32, int_arm_neon_vacgtq, 0>; +// VTST : Vector Test Bits +defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; + +// Vector Bitwise Operations. 
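The bitwise section that follows models VBSL as (or (and $src2, $src1), (and $src3, (vnot $src1))), i.e. the bits of the tied first operand select between the other two. In C terms, with the mask as the first argument; a minimal sketch assuming arm_neon.h:

  #include <arm_neon.h>

  // (b & mask) | (c & ~mask) per bit; expected selection: vbsl d0, d1, d2,
  // with the mask register tied to the destination.
  uint8x8_t select_bits(uint8x8_t mask, uint8x8_t b, uint8x8_t c) {
    return vbsl_u8(mask, b, c);
  }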
+ +def vnot8 : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 immAllOnesV)))>; +def vnot16 : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; + + +// VAND : Vector Bitwise AND +def VANDd : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand", + v2i32, v2i32, and, 1>; +def VANDq : N3VQX<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand", + v4i32, v4i32, and, 1>; + +// VEOR : Vector Bitwise Exclusive OR +def VEORd : N3VDX<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor", + v2i32, v2i32, xor, 1>; +def VEORq : N3VQX<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor", + v4i32, v4i32, xor, 1>; + +// VORR : Vector Bitwise OR +def VORRd : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr", + v2i32, v2i32, or, 1>; +def VORRq : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", + v4i32, v4i32, or, 1>; + +// VBIC : Vector Bitwise Bit Clear (AND NOT) +def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, + "vbic", "$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (and DPR:$src1, + (vnot8 DPR:$src2))))]>; +def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, + "vbic", "$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (and QPR:$src1, + (vnot16 QPR:$src2))))]>; + +// VORN : Vector Bitwise OR NOT +def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, + "vorn", "$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (or DPR:$src1, + (vnot8 DPR:$src2))))]>; +def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, + "vorn", "$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (or QPR:$src1, + (vnot16 QPR:$src2))))]>; + +// VMVN : Vector Bitwise NOT +def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0, + (outs DPR:$dst), (ins DPR:$src), IIC_VSUBiD, + "vmvn", "$dst, $src", "", + [(set DPR:$dst, (v2i32 (vnot8 DPR:$src)))]>; +def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, + (outs QPR:$dst), (ins QPR:$src), IIC_VSUBiD, + "vmvn", "$dst, $src", "", + [(set QPR:$dst, (v4i32 (vnot16 QPR:$src)))]>; +def : Pat<(v2i32 (vnot8 DPR:$src)), (VMVNd DPR:$src)>; +def : Pat<(v4i32 (vnot16 QPR:$src)), (VMVNq QPR:$src)>; + +// VBSL : Vector Bitwise Select +def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2, DPR:$src3), + N3RegFrm, IIC_VCNTiD, + "vbsl", "$dst, $src2, $src3", "$src1 = $dst", + [(set DPR:$dst, + (v2i32 (or (and DPR:$src2, DPR:$src1), + (and DPR:$src3, (vnot8 DPR:$src1)))))]>; +def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2, QPR:$src3), + N3RegFrm, IIC_VCNTiQ, + "vbsl", "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, + (v4i32 (or (and QPR:$src2, QPR:$src1), + (and QPR:$src3, (vnot16 QPR:$src1)))))]>; + +// VBIF : Vector Bitwise Insert if False +// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", +def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), + N3RegFrm, IIC_VBINiD, + "vbif", "$dst, $src2, $src3", "$src1 = $dst", + [/* For disassembly only; pattern left blank */]>; +def VBIFq : N3VX<1, 0, 0b11, 0b0001, 1, 1, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), + N3RegFrm, IIC_VBINiQ, + "vbif", "$dst, $src2, $src3", "$src1 = $dst", + [/* For disassembly only; pattern left blank */]>; + +// VBIT : Vector Bitwise Insert if True +// like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst", +def VBITd : N3VX<1, 0, 0b10, 
0b0001, 0, 1, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), + N3RegFrm, IIC_VBINiD, + "vbit", "$dst, $src2, $src3", "$src1 = $dst", + [/* For disassembly only; pattern left blank */]>; +def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), + N3RegFrm, IIC_VBINiQ, + "vbit", "$dst, $src2, $src3", "$src1 = $dst", + [/* For disassembly only; pattern left blank */]>; + +// VBIT/VBIF are not yet implemented. The TwoAddress pass will not go looking +// for equivalent operations with different register constraints; it just +// inserts copies. + +// Vector Absolute Differences. + +// VABD : Vector Absolute Difference +defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vabd", "s", int_arm_neon_vabds, 0>; +defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vabd", "u", int_arm_neon_vabdu, 0>; +def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND, + "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>; +def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ, + "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>; + +// VABDL : Vector Absolute Difference Long (Q = | D - D |) +defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vabdl", "s", int_arm_neon_vabdls, 0>; +defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vabdl", "u", int_arm_neon_vabdlu, 0>; + +// VABA : Vector Absolute Difference and Accumulate +defm VABAs : N3VInt3_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "s", int_arm_neon_vabas>; +defm VABAu : N3VInt3_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "u", int_arm_neon_vabau>; + +// VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |) +defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, IIC_VABAD, IIC_VABAD, + "vabal", "s", int_arm_neon_vabals>; +defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, IIC_VABAD, IIC_VABAD, + "vabal", "u", int_arm_neon_vabalu>; + +// Vector Maximum and Minimum. + +// VMAX : Vector Maximum +defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmax", "s", int_arm_neon_vmaxs, 1>; +defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmax", "u", int_arm_neon_vmaxu, 1>; +def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmax", "f32", + v2f32, v2f32, int_arm_neon_vmaxs, 1>; +def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmax", "f32", + v4f32, v4f32, int_arm_neon_vmaxs, 1>; + +// VMIN : Vector Minimum +defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmin", "s", int_arm_neon_vmins, 1>; +defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmin", "u", int_arm_neon_vminu, 1>; +def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmin", "f32", + v2f32, v2f32, int_arm_neon_vmins, 1>; +def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmin", "f32", + v4f32, v4f32, int_arm_neon_vmins, 1>; + +// Vector Pairwise Operations. 
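A minimal sketch of what the pairwise add below computes, assuming arm_neon.h and illustrative names; VPADDL/VPADAL are the widening and accumulating variants of the same idea:

#include <arm_neon.h>
float32x2_t pairwise_add(float32x2_t a, float32x2_t b) {
  /* result = { a[0]+a[1], b[0]+b[1] } */
  return vpadd_f32(a, b);
}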
+ +// VPADD : Vector Pairwise Add +def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VSHLiD, + "vpadd", "i8", + v8i8, v8i8, int_arm_neon_vpadd, 0>; +def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VSHLiD, + "vpadd", "i16", + v4i16, v4i16, int_arm_neon_vpadd, 0>; +def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD, + "vpadd", "i32", + v2i32, v2i32, int_arm_neon_vpadd, 0>; +def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, + IIC_VBIND, "vpadd", "f32", + v2f32, v2f32, int_arm_neon_vpadd, 0>; + +// VPADDL : Vector Pairwise Add Long +defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s", + int_arm_neon_vpaddls>; +defm VPADDLu : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl", "u", + int_arm_neon_vpaddlu>; + +// VPADAL : Vector Pairwise Add and Accumulate Long +defm VPADALs : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01100, 0, "vpadal", "s", + int_arm_neon_vpadals>; +defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u", + int_arm_neon_vpadalu>; + +// VPMAX : Vector Pairwise Maximum +def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>; +def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>; +def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>; +def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>; +def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>; +def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; +def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; + +// VPMIN : Vector Pairwise Minimum +def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "s8", v8i8, v8i8, int_arm_neon_vpmins, 0>; +def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>; +def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>; +def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>; +def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>; +def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; +def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VSUBi4D, "vpmin", + "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; + +// Vector Reciprocal and Reciprocal Square Root Estimate and Step. 
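VRECPE/VRECPS (and the VRSQRTE/VRSQRTS pair) are intended to be combined with Newton-Raphson refinement; a common usage sketch, assuming arm_neon.h and illustrative names:

#include <arm_neon.h>
float32x4_t approx_reciprocal(float32x4_t d) {
  float32x4_t x = vrecpeq_f32(d);        /* ~8-bit estimate of 1/d             */
  x = vmulq_f32(x, vrecpsq_f32(d, x));   /* VRECPS gives 2 - d*x: one NR step  */
  x = vmulq_f32(x, vrecpsq_f32(d, x));   /* second step, near single precision */
  return x;
}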
+ +// VRECPE : Vector Reciprocal Estimate +def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, + IIC_VUNAD, "vrecpe", "u32", + v2i32, v2i32, int_arm_neon_vrecpe>; +def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, + IIC_VUNAQ, "vrecpe", "u32", + v4i32, v4i32, int_arm_neon_vrecpe>; +def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, + IIC_VUNAD, "vrecpe", "f32", + v2f32, v2f32, int_arm_neon_vrecpe>; +def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, + IIC_VUNAQ, "vrecpe", "f32", + v4f32, v4f32, int_arm_neon_vrecpe>; + +// VRECPS : Vector Reciprocal Step +def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrecps", "f32", + v2f32, v2f32, int_arm_neon_vrecps, 1>; +def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrecps", "f32", + v4f32, v4f32, int_arm_neon_vrecps, 1>; + +// VRSQRTE : Vector Reciprocal Square Root Estimate +def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, + IIC_VUNAD, "vrsqrte", "u32", + v2i32, v2i32, int_arm_neon_vrsqrte>; +def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, + IIC_VUNAQ, "vrsqrte", "u32", + v4i32, v4i32, int_arm_neon_vrsqrte>; +def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, + IIC_VUNAD, "vrsqrte", "f32", + v2f32, v2f32, int_arm_neon_vrsqrte>; +def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, + IIC_VUNAQ, "vrsqrte", "f32", + v4f32, v4f32, int_arm_neon_vrsqrte>; + +// VRSQRTS : Vector Reciprocal Square Root Step +def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrsqrts", "f32", + v2f32, v2f32, int_arm_neon_vrsqrts, 1>; +def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrsqrts", "f32", + v4f32, v4f32, int_arm_neon_vrsqrts, 1>; + +// Vector Shifts. + +// VSHL : Vector Shift +defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "s", int_arm_neon_vshifts, 0>; +defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "u", int_arm_neon_vshiftu, 0>; +// VSHL : Vector Shift Left (Immediate) +defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl, + N2RegVShLFrm>; +// VSHR : Vector Shift Right (Immediate) +defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs, + N2RegVShRFrm>; +defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru, + N2RegVShRFrm>; + +// VSHLL : Vector Shift Left Long +defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>; +defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u", NEONvshllu>; + +// VSHLL : Vector Shift Left Long (with maximum shift count) +class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy, + ValueType OpTy, SDNode OpNode> + : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt, + ResTy, OpTy, OpNode> { + let Inst{21-16} = op21_16; +} +def VSHLLi8 : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8", + v8i16, v8i8, NEONvshlli>; +def VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16", + v4i32, v4i16, NEONvshlli>; +def VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32", + v2i64, v2i32, NEONvshlli>; + +// VSHRN : Vector Shift Right and Narrow +defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", + NEONvshrn>; + +// VRSHL : Vector Rounding Shift +defm VRSHLs : N3VInt_QHSD<0, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, 
IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "s", int_arm_neon_vrshifts, 0>; +defm VRSHLu : N3VInt_QHSD<1, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "u", int_arm_neon_vrshiftu, 0>; +// VRSHR : Vector Rounding Shift Right +defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs, + N2RegVShRFrm>; +defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru, + N2RegVShRFrm>; + +// VRSHRN : Vector Rounding Shift Right and Narrow +defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", + NEONvrshrn>; + +// VQSHL : Vector Saturating Shift +defm VQSHLs : N3VInt_QHSD<0, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "s", int_arm_neon_vqshifts, 0>; +defm VQSHLu : N3VInt_QHSD<1, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "u", int_arm_neon_vqshiftu, 0>; +// VQSHL : Vector Saturating Shift Left (Immediate) +defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls, + N2RegVShLFrm>; +defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu, + N2RegVShLFrm>; +// VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) +defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu, + N2RegVShLFrm>; + +// VQSHRN : Vector Saturating Shift Right and Narrow +defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", + NEONvqshrns>; +defm VQSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u", + NEONvqshrnu>; + +// VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned) +defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", + NEONvqshrnsu>; + +// VQRSHL : Vector Saturating Rounding Shift +defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "s", int_arm_neon_vqrshifts, 0>; +defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "u", int_arm_neon_vqrshiftu, 0>; + +// VQRSHRN : Vector Saturating Rounding Shift Right and Narrow +defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s", + NEONvqrshrns>; +defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u", + NEONvqrshrnu>; + +// VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned) +defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s", + NEONvqrshrnsu>; + +// VSRA : Vector Shift Right and Accumulate +defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>; +defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>; +// VRSRA : Vector Rounding Shift Right and Accumulate +defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; +defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; + +// VSLI : Vector Shift Left and Insert +defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli, N2RegVShLFrm>; +// VSRI : Vector Shift Right and Insert +defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri, N2RegVShRFrm>; + +// Vector Absolute and Saturating Absolute. 
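The VABS/VQABS distinction below only becomes visible at the type minimum; a small sketch with an illustrative value, assuming arm_neon.h:

#include <arm_neon.h>
int8x16_t saturating_abs_example(void) {
  int8x16_t v = vdupq_n_s8(-128);
  /* vabsq_s8 would wrap -128 back to -128; vqabsq_s8 saturates to +127. */
  return vqabsq_s8(v);
}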
+ +// VABS : Vector Absolute Value +defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, + IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s", + int_arm_neon_vabs>; +def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, + IIC_VUNAD, "vabs", "f32", + v2f32, v2f32, int_arm_neon_vabs>; +def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, + IIC_VUNAQ, "vabs", "f32", + v4f32, v4f32, int_arm_neon_vabs>; + +// VQABS : Vector Saturating Absolute Value +defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, + IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s", + int_arm_neon_vqabs>; + +// Vector Negate. + +def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; +def vneg8 : PatFrag<(ops node:$in), + (sub (bitconvert (v8i8 immAllZerosV)), node:$in)>; +def vneg16 : PatFrag<(ops node:$in), + (sub (bitconvert (v16i8 immAllZerosV)), node:$in)>; + +class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src), + IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", + [(set DPR:$dst, (Ty (vneg8 DPR:$src)))]>; +class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src), + IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", + [(set QPR:$dst, (Ty (vneg16 QPR:$src)))]>; + +// VNEG : Vector Negate (integer) +def VNEGs8d : VNEGD<0b00, "vneg", "s8", v8i8>; +def VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>; +def VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>; +def VNEGs8q : VNEGQ<0b00, "vneg", "s8", v16i8>; +def VNEGs16q : VNEGQ<0b01, "vneg", "s16", v8i16>; +def VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>; + +// VNEG : Vector Negate (floating-point) +def VNEGfd : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, + (outs DPR:$dst), (ins DPR:$src), IIC_VUNAD, + "vneg", "f32", "$dst, $src", "", + [(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>; +def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, + (outs QPR:$dst), (ins QPR:$src), IIC_VUNAQ, + "vneg", "f32", "$dst, $src", "", + [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>; + +def : Pat<(v8i8 (vneg8 DPR:$src)), (VNEGs8d DPR:$src)>; +def : Pat<(v4i16 (vneg8 DPR:$src)), (VNEGs16d DPR:$src)>; +def : Pat<(v2i32 (vneg8 DPR:$src)), (VNEGs32d DPR:$src)>; +def : Pat<(v16i8 (vneg16 QPR:$src)), (VNEGs8q QPR:$src)>; +def : Pat<(v8i16 (vneg16 QPR:$src)), (VNEGs16q QPR:$src)>; +def : Pat<(v4i32 (vneg16 QPR:$src)), (VNEGs32q QPR:$src)>; + +// VQNEG : Vector Saturating Negate +defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, + IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg", "s", + int_arm_neon_vqneg>; + +// Vector Bit Counting Operations. + +// VCLS : Vector Count Leading Sign Bits +defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, + IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s", + int_arm_neon_vcls>; +// VCLZ : Vector Count Leading Zeros +defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, + IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i", + int_arm_neon_vclz>; +// VCNT : Vector Count One Bits +def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, + IIC_VCNTiD, "vcnt", "8", + v8i8, v8i8, int_arm_neon_vcnt>; +def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, + IIC_VCNTiQ, "vcnt", "8", + v16i8, v16i8, int_arm_neon_vcnt>; + +// Vector Swap -- for disassembly only. +def VSWPd : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0, + (outs DPR:$dst), (ins DPR:$src), NoItinerary, + "vswp", "$dst, $src", "", []>; +def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, + (outs QPR:$dst), (ins QPR:$src), NoItinerary, + "vswp", "$dst, $src", "", []>; + +// Vector Move Operations. 
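A hedged sketch of the lane-access and narrowing-move cases handled in this section, assuming arm_neon.h and illustrative names:

#include <arm_neon.h>
int32_t move_examples(int32x4_t v, int16x8_t h) {
  int32x4_t w = vsetq_lane_s32(42, v, 0);   /* VMOV (set lane): core reg -> element */
  int8x8_t  n = vqmovn_s16(h);              /* VQMOVN: narrowing move, saturated    */
  (void)w; (void)n;
  return vgetq_lane_s32(v, 2);              /* VMOV (get lane): element -> core reg */
}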
+ +// VMOV : Vector Move (Register) + +let neverHasSideEffects = 1 in { +def VMOVDneon: N3VX<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src), + N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; +def VMOVQ : N3VX<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src), + N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; + +// Pseudo vector move instructions for QQ and QQQQ registers. This should +// be expanded after register allocation is completed. +def VMOVQQ : PseudoInst<(outs QQPR:$dst), (ins QQPR:$src), + NoItinerary, "${:comment} vmov\t$dst, $src", []>; + +def VMOVQQQQ : PseudoInst<(outs QQQQPR:$dst), (ins QQQQPR:$src), + NoItinerary, "${:comment} vmov\t$dst, $src", []>; +} // neverHasSideEffects + +// VMOV : Vector Move (Immediate) + +// VMOV_get_imm8 xform function: convert build_vector to VMOV.i8 imm. +def VMOV_get_imm8 : SDNodeXForm<build_vector, [{ + return ARM::getVMOVImm(N, 1, *CurDAG); +}]>; +def vmovImm8 : PatLeaf<(build_vector), [{ + return ARM::getVMOVImm(N, 1, *CurDAG).getNode() != 0; +}], VMOV_get_imm8>; + +// VMOV_get_imm16 xform function: convert build_vector to VMOV.i16 imm. +def VMOV_get_imm16 : SDNodeXForm<build_vector, [{ + return ARM::getVMOVImm(N, 2, *CurDAG); +}]>; +def vmovImm16 : PatLeaf<(build_vector), [{ + return ARM::getVMOVImm(N, 2, *CurDAG).getNode() != 0; +}], VMOV_get_imm16>; + +// VMOV_get_imm32 xform function: convert build_vector to VMOV.i32 imm. +def VMOV_get_imm32 : SDNodeXForm<build_vector, [{ + return ARM::getVMOVImm(N, 4, *CurDAG); +}]>; +def vmovImm32 : PatLeaf<(build_vector), [{ + return ARM::getVMOVImm(N, 4, *CurDAG).getNode() != 0; +}], VMOV_get_imm32>; + +// VMOV_get_imm64 xform function: convert build_vector to VMOV.i64 imm. +def VMOV_get_imm64 : SDNodeXForm<build_vector, [{ + return ARM::getVMOVImm(N, 8, *CurDAG); +}]>; +def vmovImm64 : PatLeaf<(build_vector), [{ + return ARM::getVMOVImm(N, 8, *CurDAG).getNode() != 0; +}], VMOV_get_imm64>; + +// Note: Some of the cmode bits in the following VMOV instructions need to +// be encoded based on the immed values. 
+ +let isReMaterializable = 1 in { +def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst), + (ins h8imm:$SIMM), IIC_VMOVImm, + "vmov", "i8", "$dst, $SIMM", "", + [(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>; +def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst), + (ins h8imm:$SIMM), IIC_VMOVImm, + "vmov", "i8", "$dst, $SIMM", "", + [(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>; + +def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,?}, 0, 0, {?}, 1, (outs DPR:$dst), + (ins h16imm:$SIMM), IIC_VMOVImm, + "vmov", "i16", "$dst, $SIMM", "", + [(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>; +def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,?}, 0, 1, {?}, 1, (outs QPR:$dst), + (ins h16imm:$SIMM), IIC_VMOVImm, + "vmov", "i16", "$dst, $SIMM", "", + [(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>; + +def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, {?}, 1, (outs DPR:$dst), + (ins h32imm:$SIMM), IIC_VMOVImm, + "vmov", "i32", "$dst, $SIMM", "", + [(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>; +def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, {?}, 1, (outs QPR:$dst), + (ins h32imm:$SIMM), IIC_VMOVImm, + "vmov", "i32", "$dst, $SIMM", "", + [(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>; + +def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst), + (ins h64imm:$SIMM), IIC_VMOVImm, + "vmov", "i64", "$dst, $SIMM", "", + [(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>; +def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst), + (ins h64imm:$SIMM), IIC_VMOVImm, + "vmov", "i64", "$dst, $SIMM", "", + [(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>; +} // isReMaterializable + +// VMOV : Vector Get Lane (move scalar to ARM core register) + +def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?}, + (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "s8", "$dst, $src[$lane]", + [(set GPR:$dst, (NEONvgetlanes (v8i8 DPR:$src), + imm:$lane))]>; +def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1}, + (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "s16", "$dst, $src[$lane]", + [(set GPR:$dst, (NEONvgetlanes (v4i16 DPR:$src), + imm:$lane))]>; +def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?}, + (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "u8", "$dst, $src[$lane]", + [(set GPR:$dst, (NEONvgetlaneu (v8i8 DPR:$src), + imm:$lane))]>; +def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1}, + (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "u16", "$dst, $src[$lane]", + [(set GPR:$dst, (NEONvgetlaneu (v4i16 DPR:$src), + imm:$lane))]>; +def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00, + (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "32", "$dst, $src[$lane]", + [(set GPR:$dst, (extractelt (v2i32 DPR:$src), + imm:$lane))]>; +// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td +def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane), + (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i8_reg imm:$lane))), + (SubReg_i8_lane imm:$lane))>; +def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane), + (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane))>; +def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane), + (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i8_reg imm:$lane))), + (SubReg_i8_lane imm:$lane))>; +def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane), + (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane))>; +def : 
Pat<(extractelt (v4i32 QPR:$src), imm:$lane), + (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane))>; +def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2), + (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)), + (SSubReg_f32_reg imm:$src2))>; +def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2), + (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4f32 QPR:$src1),QPR_VFP2)), + (SSubReg_f32_reg imm:$src2))>; +//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2), +// (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>; +def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2), + (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>; + + +// VMOV : Vector Set Lane (move ARM core register to scalar) + +let Constraints = "$src1 = $dst" in { +def VSETLNi8 : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$dst), + (ins DPR:$src1, GPR:$src2, nohash_imm:$lane), + IIC_VMOVISL, "vmov", "8", "$dst[$lane], $src2", + [(set DPR:$dst, (vector_insert (v8i8 DPR:$src1), + GPR:$src2, imm:$lane))]>; +def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$dst), + (ins DPR:$src1, GPR:$src2, nohash_imm:$lane), + IIC_VMOVISL, "vmov", "16", "$dst[$lane], $src2", + [(set DPR:$dst, (vector_insert (v4i16 DPR:$src1), + GPR:$src2, imm:$lane))]>; +def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$dst), + (ins DPR:$src1, GPR:$src2, nohash_imm:$lane), + IIC_VMOVISL, "vmov", "32", "$dst[$lane], $src2", + [(set DPR:$dst, (insertelt (v2i32 DPR:$src1), + GPR:$src2, imm:$lane))]>; +} +def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane), + (v16i8 (INSERT_SUBREG QPR:$src1, + (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1, + (DSubReg_i8_reg imm:$lane))), + GPR:$src2, (SubReg_i8_lane imm:$lane))), + (DSubReg_i8_reg imm:$lane)))>; +def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane), + (v8i16 (INSERT_SUBREG QPR:$src1, + (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1, + (DSubReg_i16_reg imm:$lane))), + GPR:$src2, (SubReg_i16_lane imm:$lane))), + (DSubReg_i16_reg imm:$lane)))>; +def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane), + (v4i32 (INSERT_SUBREG QPR:$src1, + (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1, + (DSubReg_i32_reg imm:$lane))), + GPR:$src2, (SubReg_i32_lane imm:$lane))), + (DSubReg_i32_reg imm:$lane)))>; + +def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)), + (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)), + SPR:$src2, (SSubReg_f32_reg imm:$src3))>; +def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)), + (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)), + SPR:$src2, (SSubReg_f32_reg imm:$src3))>; + +//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)), +// (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>; +def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)), + (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>; + +def : Pat<(v2f32 (scalar_to_vector SPR:$src)), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; +def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>; +def : Pat<(v4f32 (scalar_to_vector SPR:$src)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; + +def : Pat<(v8i8 (scalar_to_vector GPR:$src)), + (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0))>; +def : Pat<(v4i16 (scalar_to_vector GPR:$src)), + (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>; +def : 
Pat<(v2i32 (scalar_to_vector GPR:$src)), + (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>; + +def : Pat<(v16i8 (scalar_to_vector GPR:$src)), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)), + dsub_0)>; +def : Pat<(v8i16 (scalar_to_vector GPR:$src)), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)), + dsub_0)>; +def : Pat<(v4i32 (scalar_to_vector GPR:$src)), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)), + dsub_0)>; + +// VDUP : Vector Duplicate (from ARM core register to all elements) + +class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty> + : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$dst), (ins GPR:$src), + IIC_VMOVIS, "vdup", Dt, "$dst, $src", + [(set DPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>; +class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty> + : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$dst), (ins GPR:$src), + IIC_VMOVIS, "vdup", Dt, "$dst, $src", + [(set QPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>; + +def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>; +def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>; +def VDUP32d : VDUPD<0b11101000, 0b00, "32", v2i32>; +def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>; +def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>; +def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>; + +def VDUPfd : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$dst), (ins GPR:$src), + IIC_VMOVIS, "vdup", "32", "$dst, $src", + [(set DPR:$dst, (v2f32 (NEONvdup + (f32 (bitconvert GPR:$src)))))]>; +def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src), + IIC_VMOVIS, "vdup", "32", "$dst, $src", + [(set QPR:$dst, (v4f32 (NEONvdup + (f32 (bitconvert GPR:$src)))))]>; + +// VDUP : Vector Duplicate Lane (from scalar to all elements) + +class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt, + ValueType Ty> + : NVDupLane<op19_16, 0, (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]", + [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>; + +class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy> + : NVDupLane<op19_16, 1, (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]", + [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), + imm:$lane)))]>; + +// Inst{19-16} is partially specified depending on the element size. 
+ +def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8>; +def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16>; +def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32>; +def VDUPLNfd : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32>; +def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8>; +def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16>; +def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32>; +def VDUPLNfq : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32>; + +def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), + (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i8_reg imm:$lane))), + (SubReg_i8_lane imm:$lane)))>; +def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)), + (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)), + (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; +def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)), + (v4f32 (VDUPLNfq (v2f32 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def VDUPfdf : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 0, 0, + (outs DPR:$dst), (ins SPR:$src), + IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "", + [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>; + +def VDUPfqf : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 1, 0, + (outs QPR:$dst), (ins SPR:$src), + IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "", + [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>; + +def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)), + (INSERT_SUBREG QPR:$src, + (i64 (EXTRACT_SUBREG QPR:$src, + (DSubReg_f64_reg imm:$lane))), + (DSubReg_f64_other_reg imm:$lane))>; +def : Pat<(v2f64 (NEONvduplane (v2f64 QPR:$src), imm:$lane)), + (INSERT_SUBREG QPR:$src, + (f64 (EXTRACT_SUBREG QPR:$src, + (DSubReg_f64_reg imm:$lane))), + (DSubReg_f64_other_reg imm:$lane))>; + +// VMOVN : Vector Narrowing Move +defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD, + "vmovn", "i", int_arm_neon_vmovn>; +// VQMOVN : Vector Saturating Narrowing Move +defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD, + "vqmovn", "s", int_arm_neon_vqmovns>; +defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD, + "vqmovn", "u", int_arm_neon_vqmovnu>; +defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD, + "vqmovun", "s", int_arm_neon_vqmovnsu>; +// VMOVL : Vector Lengthening Move +defm VMOVLs : N2VLInt_QHS<0b01,0b10100,0,1, "vmovl", "s", + int_arm_neon_vmovls>; +defm VMOVLu : N2VLInt_QHS<0b11,0b10100,0,1, "vmovl", "u", + int_arm_neon_vmovlu>; + +// Vector Conversions. 
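Roughly, the conversions below correspond to the following arm_neon.h forms (illustrative names); the fixed-point variants take the fraction-bit count as an immediate:

#include <arm_neon.h>
void convert_examples(float32x4_t f, int32x4_t i) {
  int32x4_t   a = vcvtq_s32_f32(f);        /* VCVT f32 -> s32, truncates toward zero */
  float32x4_t b = vcvtq_f32_s32(i);        /* VCVT s32 -> f32                        */
  int32x4_t   c = vcvtq_n_s32_f32(f, 16);  /* fixed-point: f * 2^16, saturated       */
  (void)a; (void)b; (void)c;
}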
+ +// VCVT : Vector Convert Between Floating-Point and Integers +def VCVTf2sd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32", + v2i32, v2f32, fp_to_sint>; +def VCVTf2ud : N2VD<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32", + v2i32, v2f32, fp_to_uint>; +def VCVTs2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", + v2f32, v2i32, sint_to_fp>; +def VCVTu2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", + v2f32, v2i32, uint_to_fp>; + +def VCVTf2sq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32", + v4i32, v4f32, fp_to_sint>; +def VCVTf2uq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32", + v4i32, v4f32, fp_to_uint>; +def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", + v4f32, v4i32, sint_to_fp>; +def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", + v4f32, v4i32, uint_to_fp>; + +// VCVT : Vector Convert Between Floating-Point and Fixed-Point. +def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32", + v2i32, v2f32, int_arm_neon_vcvtfp2fxs>; +def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32", + v2i32, v2f32, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", + v2f32, v2i32, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", + v2f32, v2i32, int_arm_neon_vcvtfxu2fp>; + +def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32", + v4i32, v4f32, int_arm_neon_vcvtfp2fxs>; +def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32", + v4i32, v4f32, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", + v4f32, v4i32, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", + v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; + +// Vector Reverse. 
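A short sketch of what the VREV family below does (element order is reversed within each container of the stated size), assuming arm_neon.h:

#include <arm_neon.h>
uint8x16_t reverse_example(uint8x16_t bytes) {
  /* VREV64.8: reverse the eight bytes inside each 64-bit half of the Q register. */
  return vrev64q_u8(bytes);
}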
+ +// VREV64 : Vector Reverse elements within 64-bit doublewords + +class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$dst), + (ins DPR:$src), IIC_VMOVD, + OpcodeStr, Dt, "$dst, $src", "", + [(set DPR:$dst, (Ty (NEONvrev64 (Ty DPR:$src))))]>; +class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$dst), + (ins QPR:$src), IIC_VMOVD, + OpcodeStr, Dt, "$dst, $src", "", + [(set QPR:$dst, (Ty (NEONvrev64 (Ty QPR:$src))))]>; + +def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>; +def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>; +def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>; +def VREV64df : VREV64D<0b10, "vrev64", "32", v2f32>; + +def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>; +def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>; +def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>; +def VREV64qf : VREV64Q<0b10, "vrev64", "32", v4f32>; + +// VREV32 : Vector Reverse elements within 32-bit words + +class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$dst), + (ins DPR:$src), IIC_VMOVD, + OpcodeStr, Dt, "$dst, $src", "", + [(set DPR:$dst, (Ty (NEONvrev32 (Ty DPR:$src))))]>; +class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$dst), + (ins QPR:$src), IIC_VMOVD, + OpcodeStr, Dt, "$dst, $src", "", + [(set QPR:$dst, (Ty (NEONvrev32 (Ty QPR:$src))))]>; + +def VREV32d8 : VREV32D<0b00, "vrev32", "8", v8i8>; +def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>; + +def VREV32q8 : VREV32Q<0b00, "vrev32", "8", v16i8>; +def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>; + +// VREV16 : Vector Reverse elements within 16-bit halfwords + +class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$dst), + (ins DPR:$src), IIC_VMOVD, + OpcodeStr, Dt, "$dst, $src", "", + [(set DPR:$dst, (Ty (NEONvrev16 (Ty DPR:$src))))]>; +class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$dst), + (ins QPR:$src), IIC_VMOVD, + OpcodeStr, Dt, "$dst, $src", "", + [(set QPR:$dst, (Ty (NEONvrev16 (Ty QPR:$src))))]>; + +def VREV16d8 : VREV16D<0b00, "vrev16", "8", v8i8>; +def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; + +// Other Vector Shuffles. 
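VEXT below acts as a byte-granular extract over the concatenation of its two operands; a small sketch, assuming arm_neon.h:

#include <arm_neon.h>
uint8x16_t extract_example(uint8x16_t a, uint8x16_t b) {
  /* Bytes 3..15 of 'a' followed by bytes 0..2 of 'b'. */
  return vextq_u8(a, b, 3);
}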
+ +// VEXT : Vector Extract + +class VEXTd<string OpcodeStr, string Dt, ValueType Ty> + : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$dst), + (ins DPR:$lhs, DPR:$rhs, i32imm:$index), NVExtFrm, + IIC_VEXTD, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", + [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs), + (Ty DPR:$rhs), imm:$index)))]>; + +class VEXTq<string OpcodeStr, string Dt, ValueType Ty> + : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$dst), + (ins QPR:$lhs, QPR:$rhs, i32imm:$index), NVExtFrm, + IIC_VEXTQ, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", + [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs), + (Ty QPR:$rhs), imm:$index)))]>; + +def VEXTd8 : VEXTd<"vext", "8", v8i8>; +def VEXTd16 : VEXTd<"vext", "16", v4i16>; +def VEXTd32 : VEXTd<"vext", "32", v2i32>; +def VEXTdf : VEXTd<"vext", "32", v2f32>; + +def VEXTq8 : VEXTq<"vext", "8", v16i8>; +def VEXTq16 : VEXTq<"vext", "16", v8i16>; +def VEXTq32 : VEXTq<"vext", "32", v4i32>; +def VEXTqf : VEXTq<"vext", "32", v4f32>; + +// VTRN : Vector Transpose + +def VTRNd8 : N2VDShuffle<0b00, 0b00001, "vtrn", "8">; +def VTRNd16 : N2VDShuffle<0b01, 0b00001, "vtrn", "16">; +def VTRNd32 : N2VDShuffle<0b10, 0b00001, "vtrn", "32">; + +def VTRNq8 : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn", "8">; +def VTRNq16 : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn", "16">; +def VTRNq32 : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn", "32">; + +// VUZP : Vector Unzip (Deinterleave) + +def VUZPd8 : N2VDShuffle<0b00, 0b00010, "vuzp", "8">; +def VUZPd16 : N2VDShuffle<0b01, 0b00010, "vuzp", "16">; +def VUZPd32 : N2VDShuffle<0b10, 0b00010, "vuzp", "32">; + +def VUZPq8 : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp", "8">; +def VUZPq16 : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp", "16">; +def VUZPq32 : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp", "32">; + +// VZIP : Vector Zip (Interleave) + +def VZIPd8 : N2VDShuffle<0b00, 0b00011, "vzip", "8">; +def VZIPd16 : N2VDShuffle<0b01, 0b00011, "vzip", "16">; +def VZIPd32 : N2VDShuffle<0b10, 0b00011, "vzip", "32">; + +def VZIPq8 : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip", "8">; +def VZIPq16 : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip", "16">; +def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">; + +// Vector Table Lookup and Table Extension. 
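The practical difference between VTBL and VTBX below is the handling of out-of-range indices; a sketch with illustrative names, assuming arm_neon.h:

#include <arm_neon.h>
uint8x8_t table_examples(uint8x8_t orig, uint8x8_t table, uint8x8_t idx) {
  uint8x8_t t = vtbl1_u8(table, idx);         /* VTBL: index >= 8 yields 0               */
  uint8x8_t x = vtbx1_u8(orig, table, idx);   /* VTBX: index >= 8 keeps the lane of orig */
  return veor_u8(t, x);
}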
+ +// VTBL : Vector Table Lookup +def VTBL1 + : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst), + (ins DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTB1, + "vtbl", "8", "$dst, \\{$tbl1\\}, $src", "", + [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>; +let hasExtraSrcRegAllocReq = 1 in { +def VTBL2 + : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst), + (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTB2, + "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", + [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2 + DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; +def VTBL3 + : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst), + (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTB3, + "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", + [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3 + DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; +def VTBL4 + : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst), + (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), + NVTBLFrm, IIC_VTB4, + "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", + [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2, + DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; +} // hasExtraSrcRegAllocReq = 1 + +// VTBX : Vector Table Extension +def VTBX1 + : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst), + (ins DPR:$orig, DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTBX1, + "vtbx", "8", "$dst, \\{$tbl1\\}, $src", "$orig = $dst", + [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1 + DPR:$orig, DPR:$tbl1, DPR:$src)))]>; +let hasExtraSrcRegAllocReq = 1 in { +def VTBX2 + : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst), + (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTBX2, + "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", + [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2 + DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; +def VTBX3 + : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst), + (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), + NVTBLFrm, IIC_VTBX3, + "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "$orig = $dst", + [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1, + DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; +def VTBX4 + : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, + DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTBX4, + "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", + "$orig = $dst", + [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1, + DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; +} // hasExtraSrcRegAllocReq = 1 + +//===----------------------------------------------------------------------===// +// NEON instructions for single-precision FP math +//===----------------------------------------------------------------------===// + +class N2VSPat<SDNode OpNode, ValueType ResTy, ValueType OpTy, NeonI Inst> + : NEONFPPat<(ResTy (OpNode SPR:$a)), + (EXTRACT_SUBREG (OpTy (Inst (INSERT_SUBREG (OpTy (IMPLICIT_DEF)), + SPR:$a, ssub_0))), + ssub_0)>; + +class N3VSPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)), + (EXTRACT_SUBREG (v2f32 + (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$a, ssub_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$b, ssub_0))), + ssub_0)>; + +class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))), + (EXTRACT_SUBREG (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$acc, ssub_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$a, ssub_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$b, ssub_0)), + ssub_0)>; + +// These need 
separate instructions because they must use DPR_VFP2 register +// class which have SPR sub-registers. + +// Vector Add Operations used for single-precision FP +let neverHasSideEffects = 1 in +def VADDfd_sfp : N3VS<0,0,0b00,0b1101,0, "vadd", "f32", v2f32, v2f32, fadd, 1>; +def : N3VSPat<fadd, VADDfd_sfp>; + +// Vector Sub Operations used for single-precision FP +let neverHasSideEffects = 1 in +def VSUBfd_sfp : N3VS<0,0,0b10,0b1101,0, "vsub", "f32", v2f32, v2f32, fsub, 0>; +def : N3VSPat<fsub, VSUBfd_sfp>; + +// Vector Multiply Operations used for single-precision FP +let neverHasSideEffects = 1 in +def VMULfd_sfp : N3VS<1,0,0b00,0b1101,1, "vmul", "f32", v2f32, v2f32, fmul, 1>; +def : N3VSPat<fmul, VMULfd_sfp>; + +// Vector Multiply-Accumulate/Subtract used for single-precision FP +// vml[as].f32 can cause 4-8 cycle stalls in following ASIMD instructions, so +// we want to avoid them for now. e.g., alternating vmla/vadd instructions. + +//let neverHasSideEffects = 1 in +//def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32", +// v2f32, fmul, fadd>; +//def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>; + +//let neverHasSideEffects = 1 in +//def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32", +// v2f32, fmul, fsub>; +//def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>; + +// Vector Absolute used for single-precision FP +let neverHasSideEffects = 1 in +def VABSfd_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01110, 0, 0, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD, + "vabs", "f32", "$dst, $src", "", []>; +def : N2VSPat<fabs, f32, v2f32, VABSfd_sfp>; + +// Vector Negate used for single-precision FP +let neverHasSideEffects = 1 in +def VNEGfd_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD, + "vneg", "f32", "$dst, $src", "", []>; +def : N2VSPat<fneg, f32, v2f32, VNEGfd_sfp>; + +// Vector Maximum used for single-precision FP +let neverHasSideEffects = 1 in +def VMAXfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, + "vmax", "f32", "$dst, $src1, $src2", "", []>; +def : N3VSPat<NEONfmax, VMAXfd_sfp>; + +// Vector Minimum used for single-precision FP +let neverHasSideEffects = 1 in +def VMINfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, + "vmin", "f32", "$dst, $src1, $src2", "", []>; +def : N3VSPat<NEONfmin, VMINfd_sfp>; + +// Vector Convert between single-precision FP and integer +let neverHasSideEffects = 1 in +def VCVTf2sd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32", + v2i32, v2f32, fp_to_sint>; +def : N2VSPat<arm_ftosi, f32, v2f32, VCVTf2sd_sfp>; + +let neverHasSideEffects = 1 in +def VCVTf2ud_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32", + v2i32, v2f32, fp_to_uint>; +def : N2VSPat<arm_ftoui, f32, v2f32, VCVTf2ud_sfp>; + +let neverHasSideEffects = 1 in +def VCVTs2fd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", + v2f32, v2i32, sint_to_fp>; +def : N2VSPat<arm_sitof, f32, v2i32, VCVTs2fd_sfp>; + +let neverHasSideEffects = 1 in +def VCVTu2fd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", + v2f32, v2i32, uint_to_fp>; +def : N2VSPat<arm_uitof, f32, v2i32, VCVTu2fd_sfp>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// bit_convert +def : Pat<(v1i64 
(bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; + +def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; +def : Pat<(v4f32 
(bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; +def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; +def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; +def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; +def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td new file mode 100644 index 0000000..40f924b --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -0,0 +1,1043 @@ +//===- ARMInstrThumb.td - Thumb support for ARM ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Thumb instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Thumb specific DAG Nodes. +// + +def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, + SDNPVariadic]>; + +def imm_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32); +}]>; +def imm_comp_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32); +}]>; + + +/// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7]. +def imm0_7 : PatLeaf<(i32 imm), [{ + return (uint32_t)N->getZExtValue() < 8; +}]>; +def imm0_7_neg : PatLeaf<(i32 imm), [{ + return (uint32_t)-N->getZExtValue() < 8; +}], imm_neg_XFORM>; + +def imm0_255 : PatLeaf<(i32 imm), [{ + return (uint32_t)N->getZExtValue() < 256; +}]>; +def imm0_255_comp : PatLeaf<(i32 imm), [{ + return ~((uint32_t)N->getZExtValue()) < 256; +}]>; + +def imm8_255 : PatLeaf<(i32 imm), [{ + return (uint32_t)N->getZExtValue() >= 8 && (uint32_t)N->getZExtValue() < 256; +}]>; +def imm8_255_neg : PatLeaf<(i32 imm), [{ + unsigned Val = -N->getZExtValue(); + return Val >= 8 && Val < 256; +}], imm_neg_XFORM>; + +// Break imm's up into two pieces: an immediate + a left shift. +// This uses thumb_immshifted to match and thumb_immshifted_val and +// thumb_immshifted_shamt to get the val/shift pieces. +def thumb_immshifted : PatLeaf<(imm), [{ + return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue()); +}]>; + +def thumb_immshifted_val : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +def thumb_immshifted_shamt : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +// Scaled 4 immediate. +def t_imm_s4 : Operand<i32> { + let PrintMethod = "printThumbS4ImmOperand"; +} + +// Define Thumb specific addressing modes. 
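The t_addrmode_* operands below cover reg+reg and reg plus a scaled 5-bit immediate (scale 4/2/1 for word/halfword/byte accesses), plus sp + imm8*4. A hypothetical helper, not from this patch, showing which word offsets Thumb-1 can encode directly:

/* Hypothetical check: a Thumb-1 LDR/STR (word, immediate) offset is imm5 * 4,
   so only word-aligned offsets in [0, 124] are directly encodable. */
int isEncodableThumb1WordOffset(unsigned Offset) {
  return (Offset & 3) == 0 && (Offset >> 2) < 32;
}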
+ +// t_addrmode_rr := reg + reg +// +def t_addrmode_rr : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> { + let PrintMethod = "printThumbAddrModeRROperand"; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); +} + +// t_addrmode_s4 := reg + reg +// reg + imm5 * 4 +// +def t_addrmode_s4 : Operand<i32>, + ComplexPattern<i32, 3, "SelectThumbAddrModeS4", []> { + let PrintMethod = "printThumbAddrModeS4Operand"; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg); +} + +// t_addrmode_s2 := reg + reg +// reg + imm5 * 2 +// +def t_addrmode_s2 : Operand<i32>, + ComplexPattern<i32, 3, "SelectThumbAddrModeS2", []> { + let PrintMethod = "printThumbAddrModeS2Operand"; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg); +} + +// t_addrmode_s1 := reg + reg +// reg + imm5 +// +def t_addrmode_s1 : Operand<i32>, + ComplexPattern<i32, 3, "SelectThumbAddrModeS1", []> { + let PrintMethod = "printThumbAddrModeS1Operand"; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg); +} + +// t_addrmode_sp := sp + imm8 * 4 +// +def t_addrmode_sp : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> { + let PrintMethod = "printThumbAddrModeSPOperand"; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE +// from removing one half of the matched pairs. That breaks PEI, which assumes +// these will always be in pairs, and asserts if it finds otherwise. Better way? +let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { +def tADJCALLSTACKUP : +PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary, + "${:comment} tADJCALLSTACKUP $amt1", + [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb1Only]>; + +def tADJCALLSTACKDOWN : +PseudoInst<(outs), (ins i32imm:$amt), NoItinerary, + "${:comment} tADJCALLSTACKDOWN $amt", + [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb1Only]>; +} + +def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = 0b00000000; +} + +def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = 0b00010000; +} + +def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = 0b00100000; +} + +def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = 0b00110000; +} + +def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = 0b01000000; +} + +def tSETENDBE : T1I<(outs), (ins), NoItinerary, "setend\tbe", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101101> { + let Inst{9-5} = 0b10010; + let Inst{3} = 1; +} + +def tSETENDLE : T1I<(outs), (ins), NoItinerary, "setend\tle", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101101> { + let Inst{9-5} = 0b10010; + let Inst{3} = 0; +} + +// The i32imm operand $val can be used by a debugger to store more information +// about the breakpoint. 
+def tBKPT : T1I<(outs), (ins i32imm:$val), NoItinerary, "bkpt\t$val", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b10; +} + +// Change Processor State is a system instruction -- for disassembly only. +// The singleton $opt operand contains the following information: +// opt{4-0} = mode ==> don't care +// opt{5} = changemode ==> 0 (false for 16-bit Thumb instr) +// opt{8-6} = AIF from Inst{2-0} +// opt{10-9} = 1:imod from Inst{4} with 0b10 as enable and 0b11 as disable +// +// The opt{4-0} and opt{5} sub-fields are to accommodate 32-bit Thumb and ARM +// CPS which has more options. +def tCPS : T1I<(outs), (ins cps_opt:$opt), NoItinerary, "cps$opt", + [/* For disassembly only; pattern left blank */]>, + T1Misc<0b0110011>; + +// For both thumb1 and thumb2. +let isNotDuplicable = 1 in +def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, + "\n$cp:\n\tadd\t$dst, pc", + [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>, + T1Special<{0,0,?,?}> { + let Inst{6-3} = 0b1111; // A8.6.6 Rm = pc +} + +// PC relative add. +def tADDrPCi : T1I<(outs tGPR:$dst), (ins t_imm_s4:$rhs), IIC_iALUi, + "add\t$dst, pc, $rhs", []>, + T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 + +// ADD rd, sp, #imm8 +def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, t_imm_s4:$rhs), IIC_iALUi, + "add\t$dst, $sp, $rhs", []>, + T1Encoding<{1,0,1,0,1,?}>; // A6.2 & A8.6.8 + +// ADD sp, sp, #imm7 +def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi, + "add\t$dst, $rhs", []>, + T1Misc<{0,0,0,0,0,?,?}>; // A6.2.5 & A8.6.8 + +// SUB sp, sp, #imm7 +def tSUBspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi, + "sub\t$dst, $rhs", []>, + T1Misc<{0,0,0,0,1,?,?}>; // A6.2.5 & A8.6.215 + +// ADD rm, sp +def tADDrSP : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + "add\t$dst, $rhs", []>, + T1Special<{0,0,?,?}> { + let Inst{6-3} = 0b1101; // A8.6.9 Encoding T1 +} + +// ADD sp, rm +def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + "add\t$dst, $rhs", []>, + T1Special<{0,0,?,?}> { + // A8.6.9 Encoding T2 + let Inst{7} = 1; + let Inst{2-0} = 0b101; +} + +// Pseudo instruction that will expand into a tSUBspi + a copy. +let usesCustomInserter = 1 in { // Expanded after instruction selection. +def tSUBspi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), + NoItinerary, "${:comment} sub\t$dst, $rhs", []>; + +def tADDspr_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), + NoItinerary, "${:comment} add\t$dst, $rhs", []>; + +let Defs = [CPSR] in +def tANDsp : PseudoInst<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), + NoItinerary, "${:comment} and\t$dst, $rhs", []>; +} // usesCustomInserter + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. +// + +let isReturn = 1, isTerminator = 1, isBarrier = 1 in { + def tBX_RET : TI<(outs), (ins), IIC_Br, "bx\tlr", [(ARMretflag)]>, + T1Special<{1,1,0,?}> { // A6.2.3 & A8.6.25 + let Inst{6-3} = 0b1110; // Rm = lr + } + // Alternative return instruction used by vararg functions. 
+ def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), IIC_Br, "bx\t$target",[]>, + T1Special<{1,1,0,?}>; // A6.2.3 & A8.6.25 +} + +// Indirect branches +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def tBRIND : TI<(outs), (ins GPR:$dst), IIC_Br, "mov\tpc, $dst", + [(brind GPR:$dst)]>, + T1Special<{1,0,1,?}> { + // <Rd> = Inst{7:2-0} = pc + let Inst{2-0} = 0b111; + } +} + +// FIXME: remove when we have a way to marking a MI with these properties. +let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, + hasExtraDefRegAllocReq = 1 in +def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops), IIC_Br, + "pop${p}\t$dsts", []>, + T1Misc<{1,1,0,?,?,?,?}>; + +let isCall = 1, + Defs = [R0, R1, R2, R3, R12, LR, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { + // Also used for Thumb2 + def tBL : TIx2<0b11110, 0b11, 1, + (outs), (ins i32imm:$func, variable_ops), IIC_Br, + "bl\t${func:call}", + [(ARMtcall tglobaladdr:$func)]>, + Requires<[IsThumb, IsNotDarwin]>; + + // ARMv5T and above, also used for Thumb2 + def tBLXi : TIx2<0b11110, 0b11, 0, + (outs), (ins i32imm:$func, variable_ops), IIC_Br, + "blx\t${func:call}", + [(ARMcall tglobaladdr:$func)]>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>; + + // Also used for Thumb2 + def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, + "blx\t$func", + [(ARMtcall GPR:$func)]>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>, + T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24; + + // ARMv4T + def tBX : TIx2<{?,?,?,?,?}, {?,?}, ?, + (outs), (ins tGPR:$func, variable_ops), IIC_Br, + "mov\tlr, pc\n\tbx\t$func", + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsThumb1Only, IsNotDarwin]>; +} + +// On Darwin R9 is call-clobbered. 
+let isCall = 1, + Defs = [R0, R1, R2, R3, R9, R12, LR, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { + // Also used for Thumb2 + def tBLr9 : TIx2<0b11110, 0b11, 1, + (outs), (ins i32imm:$func, variable_ops), IIC_Br, + "bl\t${func:call}", + [(ARMtcall tglobaladdr:$func)]>, + Requires<[IsThumb, IsDarwin]>; + + // ARMv5T and above, also used for Thumb2 + def tBLXi_r9 : TIx2<0b11110, 0b11, 0, + (outs), (ins i32imm:$func, variable_ops), IIC_Br, + "blx\t${func:call}", + [(ARMcall tglobaladdr:$func)]>, + Requires<[IsThumb, HasV5T, IsDarwin]>; + + // Also used for Thumb2 + def tBLXr_r9 : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, + "blx\t$func", + [(ARMtcall GPR:$func)]>, + Requires<[IsThumb, HasV5T, IsDarwin]>, + T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24 + + // ARMv4T + def tBXr9 : TIx2<{?,?,?,?,?}, {?,?}, ?, + (outs), (ins tGPR:$func, variable_ops), IIC_Br, + "mov\tlr, pc\n\tbx\t$func", + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsThumb1Only, IsDarwin]>; +} + +let isBranch = 1, isTerminator = 1 in { + let isBarrier = 1 in { + let isPredicable = 1 in + def tB : T1I<(outs), (ins brtarget:$target), IIC_Br, + "b\t$target", [(br bb:$target)]>, + T1Encoding<{1,1,1,0,0,?}>; + + // Far jump + let Defs = [LR] in + def tBfar : TIx2<0b11110, 0b11, 1, (outs), (ins brtarget:$target), IIC_Br, + "bl\t$target\t${:comment} far jump",[]>; + + def tBR_JTr : T1JTI<(outs), + (ins tGPR:$target, jtblock_operand:$jt, i32imm:$id), + IIC_Br, "mov\tpc, $target\n\t.align\t2\n$jt", + [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>, + Encoding16 { + let Inst{15-7} = 0b010001101; + let Inst{2-0} = 0b111; + } + } +} + +// FIXME: should be able to write a pattern for ARMBrcond, but can't use +// a two-value operand where a dag node expects two operands. :( +let isBranch = 1, isTerminator = 1 in + def tBcc : T1I<(outs), (ins brtarget:$target, pred:$cc), IIC_Br, + "b$cc\t$target", + [/*(ARMbrcond bb:$target, imm:$cc)*/]>, + T1Encoding<{1,1,0,1,?,?}>; + +// Compare and branch on zero / non-zero +let isBranch = 1, isTerminator = 1 in { + def tCBZ : T1I<(outs), (ins tGPR:$cmp, brtarget:$target), IIC_Br, + "cbz\t$cmp, $target", []>, + T1Misc<{0,0,?,1,?,?,?}>; + + def tCBNZ : T1I<(outs), (ins tGPR:$cmp, brtarget:$target), IIC_Br, + "cbnz\t$cmp, $target", []>, + T1Misc<{1,0,?,1,?,?,?}>; +} + +// A8.6.218 Supervisor Call (Software Interrupt) -- for disassembly only +// A8.6.16 B: Encoding T1 +// If Inst{11-8} == 0b1111 then SEE SVC +let isCall = 1 in { +def tSVC : T1pI<(outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc", []>, + Encoding16 { + let Inst{15-12} = 0b1101; + let Inst{11-8} = 0b1111; +} +} + +// A8.6.16 B: Encoding T1 +// If Inst{11-8} == 0b1110 then UNDEFINED +// FIXME: Temporary emitted as raw bytes until this pseudo-op will be added to +// binutils +let isBarrier = 1, isTerminator = 1 in +def tTRAP : TI<(outs), (ins), IIC_Br, + ".short 0xdefe ${:comment} trap", [(trap)]>, Encoding16 { + let Inst{15-12} = 0b1101; + let Inst{11-8} = 0b1110; +} + +//===----------------------------------------------------------------------===// +// Load Store Instructions. 
+// + +let canFoldAsLoad = 1, isReMaterializable = 1 in +def tLDR : T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, + "ldr", "\t$dst, $addr", + [(set tGPR:$dst, (load t_addrmode_s4:$addr))]>, + T1LdSt<0b100>; +def tLDRi: T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, + "ldr", "\t$dst, $addr", + []>, + T1LdSt4Imm<{1,?,?}>; + +def tLDRB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr, + "ldrb", "\t$dst, $addr", + [(set tGPR:$dst, (zextloadi8 t_addrmode_s1:$addr))]>, + T1LdSt<0b110>; +def tLDRBi: T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr, + "ldrb", "\t$dst, $addr", + []>, + T1LdSt1Imm<{1,?,?}>; + +def tLDRH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr, + "ldrh", "\t$dst, $addr", + [(set tGPR:$dst, (zextloadi16 t_addrmode_s2:$addr))]>, + T1LdSt<0b101>; +def tLDRHi: T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr, + "ldrh", "\t$dst, $addr", + []>, + T1LdSt2Imm<{1,?,?}>; + +let AddedComplexity = 10 in +def tLDRSB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr, + "ldrsb", "\t$dst, $addr", + [(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>, + T1LdSt<0b011>; + +let AddedComplexity = 10 in +def tLDRSH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr, + "ldrsh", "\t$dst, $addr", + [(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>, + T1LdSt<0b111>; + +let canFoldAsLoad = 1 in +def tLDRspi : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi, + "ldr", "\t$dst, $addr", + [(set tGPR:$dst, (load t_addrmode_sp:$addr))]>, + T1LdStSP<{1,?,?}>; + +// Special instruction for restore. It cannot clobber condition register +// when it's expanded by eliminateCallFramePseudoInstr(). +let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1 in +def tRestore : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi, + "ldr", "\t$dst, $addr", []>, + T1LdStSP<{1,?,?}>; + +// Load tconstpool +// FIXME: Use ldr.n to work around a Darwin assembler bug. +let canFoldAsLoad = 1, isReMaterializable = 1 in +def tLDRpci : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi, + "ldr", ".n\t$dst, $addr", + [(set tGPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>, + T1Encoding<{0,1,0,0,1,?}>; // A6.2 & A8.6.59 + +// Special LDR for loads from non-pc-relative constpools. 
+let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1, + isReMaterializable = 1 in +def tLDRcp : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi, + "ldr", "\t$dst, $addr", []>, + T1LdStSP<{1,?,?}>; + +def tSTR : T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer, + "str", "\t$src, $addr", + [(store tGPR:$src, t_addrmode_s4:$addr)]>, + T1LdSt<0b000>; +def tSTRi: T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer, + "str", "\t$src, $addr", + []>, + T1LdSt4Imm<{0,?,?}>; + +def tSTRB : T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer, + "strb", "\t$src, $addr", + [(truncstorei8 tGPR:$src, t_addrmode_s1:$addr)]>, + T1LdSt<0b010>; +def tSTRBi: T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer, + "strb", "\t$src, $addr", + []>, + T1LdSt1Imm<{0,?,?}>; + +def tSTRH : T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer, + "strh", "\t$src, $addr", + [(truncstorei16 tGPR:$src, t_addrmode_s2:$addr)]>, + T1LdSt<0b001>; +def tSTRHi: T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer, + "strh", "\t$src, $addr", + []>, + T1LdSt2Imm<{0,?,?}>; + +def tSTRspi : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei, + "str", "\t$src, $addr", + [(store tGPR:$src, t_addrmode_sp:$addr)]>, + T1LdStSP<{0,?,?}>; + +let mayStore = 1, neverHasSideEffects = 1 in { +// Special instruction for spill. It cannot clobber condition register +// when it's expanded by eliminateCallFramePseudoInstr(). +def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei, + "str", "\t$src, $addr", []>, + T1LdStSP<{0,?,?}>; +} + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +// These requires base address to be written back or one of the loaded regs. +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +def tLDM : T1I<(outs), + (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), + IIC_iLoadm, + "ldm${addr:submode}${p}\t$addr, $dsts", []>, + T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53 + +def tLDM_UPD : T1It<(outs tGPR:$wb), + (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), + IIC_iLoadm, + "ldm${addr:submode}${p}\t$addr!, $dsts", + "$addr.addr = $wb", []>, + T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53 +} // mayLoad, neverHasSideEffects = 1, hasExtraDefRegAllocReq + +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in +def tSTM_UPD : T1It<(outs tGPR:$wb), + (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops), + IIC_iStorem, + "stm${addr:submode}${p}\t$addr!, $srcs", + "$addr.addr = $wb", []>, + T1Encoding<{1,1,0,0,0,?}>; // A6.2 & A8.6.189 + +let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in +def tPOP : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops), IIC_Br, + "pop${p}\t$dsts", []>, + T1Misc<{1,1,0,?,?,?,?}>; + +let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in +def tPUSH : T1I<(outs), (ins pred:$p, reglist:$srcs, variable_ops), IIC_Br, + "push${p}\t$srcs", []>, + T1Misc<{0,1,0,?,?,?,?}>; + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. 
+// + +// Add with carry register +let isCommutable = 1, Uses = [CPSR] in +def tADC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "adc", "\t$dst, $rhs", + [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>, + T1DataProcessing<0b0101>; + +// Add immediate +def tADDi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, + "add", "\t$dst, $lhs, $rhs", + [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>, + T1General<0b01110>; + +def tADDi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, + "add", "\t$dst, $rhs", + [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>, + T1General<{1,1,0,?,?}>; + +// Add register +let isCommutable = 1 in +def tADDrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "add", "\t$dst, $lhs, $rhs", + [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>, + T1General<0b01100>; + +let neverHasSideEffects = 1 in +def tADDhirr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + "add", "\t$dst, $rhs", []>, + T1Special<{0,0,?,?}>; + +// And register +let isCommutable = 1 in +def tAND : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "and", "\t$dst, $rhs", + [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>, + T1DataProcessing<0b0000>; + +// ASR immediate +def tASRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi, + "asr", "\t$dst, $lhs, $rhs", + [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>, + T1General<{0,1,0,?,?}>; + +// ASR register +def tASRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, + "asr", "\t$dst, $rhs", + [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>, + T1DataProcessing<0b0100>; + +// BIC register +def tBIC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "bic", "\t$dst, $rhs", + [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>, + T1DataProcessing<0b1110>; + +// CMN register +let Defs = [CPSR] in { +//FIXME: Disable CMN, as CCodes are backwards from compare expectations +// Compare-to-zero still works out, just not the relationals +//def tCMN : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, +// "cmn", "\t$lhs, $rhs", +// [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>, +// T1DataProcessing<0b1011>; +def tCMNz : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, + "cmn", "\t$lhs, $rhs", + [(ARMcmpZ tGPR:$lhs, (ineg tGPR:$rhs))]>, + T1DataProcessing<0b1011>; +} + +// CMP immediate +let Defs = [CPSR] in { +def tCMPi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi, + "cmp", "\t$lhs, $rhs", + [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>, + T1General<{1,0,1,?,?}>; +def tCMPzi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi, + "cmp", "\t$lhs, $rhs", + [(ARMcmpZ tGPR:$lhs, imm0_255:$rhs)]>, + T1General<{1,0,1,?,?}>; +} + +// CMP register +let Defs = [CPSR] in { +def tCMPr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, + "cmp", "\t$lhs, $rhs", + [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>, + T1DataProcessing<0b1010>; +def tCMPzr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, + "cmp", "\t$lhs, $rhs", + [(ARMcmpZ tGPR:$lhs, tGPR:$rhs)]>, + T1DataProcessing<0b1010>; + +def tCMPhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr, + "cmp", "\t$lhs, $rhs", []>, + T1Special<{0,1,?,?}>; +def tCMPzhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr, + "cmp", "\t$lhs, $rhs", []>, + T1Special<{0,1,?,?}>; +} + + +// XOR register +let isCommutable = 1 in +def tEOR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "eor", "\t$dst, $rhs", + [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>, + 
T1DataProcessing<0b0001>; + +// LSL immediate +def tLSLri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi, + "lsl", "\t$dst, $lhs, $rhs", + [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>, + T1General<{0,0,0,?,?}>; + +// LSL register +def tLSLrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, + "lsl", "\t$dst, $rhs", + [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>, + T1DataProcessing<0b0010>; + +// LSR immediate +def tLSRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi, + "lsr", "\t$dst, $lhs, $rhs", + [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>, + T1General<{0,0,1,?,?}>; + +// LSR register +def tLSRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, + "lsr", "\t$dst, $rhs", + [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>, + T1DataProcessing<0b0011>; + +// move register +def tMOVi8 : T1sI<(outs tGPR:$dst), (ins i32imm:$src), IIC_iMOVi, + "mov", "\t$dst, $src", + [(set tGPR:$dst, imm0_255:$src)]>, + T1General<{1,0,0,?,?}>; + +// TODO: A7-73: MOV(2) - mov setting flag. + + +let neverHasSideEffects = 1 in { +// FIXME: Make this predicable. +def tMOVr : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr, + "mov\t$dst, $src", []>, + T1Special<0b1000>; +let Defs = [CPSR] in +def tMOVSr : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr, + "movs\t$dst, $src", []>, Encoding16 { + let Inst{15-6} = 0b0000000000; +} + +// FIXME: Make these predicable. +def tMOVgpr2tgpr : T1I<(outs tGPR:$dst), (ins GPR:$src), IIC_iMOVr, + "mov\t$dst, $src", []>, + T1Special<{1,0,0,?}>; +def tMOVtgpr2gpr : T1I<(outs GPR:$dst), (ins tGPR:$src), IIC_iMOVr, + "mov\t$dst, $src", []>, + T1Special<{1,0,?,0}>; +def tMOVgpr2gpr : T1I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, + "mov\t$dst, $src", []>, + T1Special<{1,0,?,?}>; +} // neverHasSideEffects + +// multiply register +let isCommutable = 1 in +def tMUL : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMUL32, + "mul", "\t$dst, $rhs, $dst", /* A8.6.105 MUL Encoding T1 */ + [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>, + T1DataProcessing<0b1101>; + +// move inverse register +def tMVN : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr, + "mvn", "\t$dst, $src", + [(set tGPR:$dst, (not tGPR:$src))]>, + T1DataProcessing<0b1111>; + +// bitwise or register +let isCommutable = 1 in +def tORR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "orr", "\t$dst, $rhs", + [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>, + T1DataProcessing<0b1100>; + +// swaps +def tREV : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "rev", "\t$dst, $src", + [(set tGPR:$dst, (bswap tGPR:$src))]>, + Requires<[IsThumb1Only, HasV6]>, + T1Misc<{1,0,1,0,0,0,?}>; + +def tREV16 : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "rev16", "\t$dst, $src", + [(set tGPR:$dst, + (or (and (srl tGPR:$src, (i32 8)), 0xFF), + (or (and (shl tGPR:$src, (i32 8)), 0xFF00), + (or (and (srl tGPR:$src, (i32 8)), 0xFF0000), + (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>, + Requires<[IsThumb1Only, HasV6]>, + T1Misc<{1,0,1,0,0,1,?}>; + +def tREVSH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "revsh", "\t$dst, $src", + [(set tGPR:$dst, + (sext_inreg + (or (srl (and tGPR:$src, 0xFF00), (i32 8)), + (shl tGPR:$src, (i32 8))), i16))]>, + Requires<[IsThumb1Only, HasV6]>, + T1Misc<{1,0,1,0,1,1,?}>; + +// rotate right register +def tROR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, + "ror", "\t$dst, $rhs", + [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>, + T1DataProcessing<0b0111>; 
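An aside, not part of this patch: the tREV16 and tREVSH selection patterns above match fairly dense DAGs of shifts, masks and sign extension. In plain C++ (assuming two's-complement 32-bit arithmetic, and not tied to any LLVM API), the expressions they recognize amount to:

#include <cstdint>

// rev16: swap the two bytes within each halfword. This is the value computed
// by the or/and/shl/srl DAG that the tREV16 pattern matches.
uint32_t rev16(uint32_t x) {
  return ((x >> 8) & 0x00FF00FFu) | ((x << 8) & 0xFF00FF00u);
}

// revsh: byte-swap the low halfword, then sign-extend it to 32 bits. This is
// the sext_inreg(..., i16) DAG that the tREVSH pattern matches.
int32_t revsh(uint32_t x) {
  uint16_t lo = (uint16_t)(((x & 0xFF00u) >> 8) | (x << 8));
  return (int16_t)lo;
}

Per their Requires<[IsThumb1Only, HasV6]> predicates, these patterns only fire on Thumb-1-only subtargets that have ARMv6.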
+ +// negate register +def tRSB : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iALUi, + "rsb", "\t$dst, $src, #0", + [(set tGPR:$dst, (ineg tGPR:$src))]>, + T1DataProcessing<0b1001>; + +// Subtract with carry register +let Uses = [CPSR] in +def tSBC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "sbc", "\t$dst, $rhs", + [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>, + T1DataProcessing<0b0110>; + +// Subtract immediate +def tSUBi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, + "sub", "\t$dst, $lhs, $rhs", + [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>, + T1General<0b01111>; + +def tSUBi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, + "sub", "\t$dst, $rhs", + [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>, + T1General<{1,1,1,?,?}>; + +// subtract register +def tSUBrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "sub", "\t$dst, $lhs, $rhs", + [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>, + T1General<0b01101>; + +// TODO: A7-96: STMIA - store multiple. + +// sign-extend byte +def tSXTB : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "sxtb", "\t$dst, $src", + [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>, + Requires<[IsThumb1Only, HasV6]>, + T1Misc<{0,0,1,0,0,1,?}>; + +// sign-extend short +def tSXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "sxth", "\t$dst, $src", + [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>, + Requires<[IsThumb1Only, HasV6]>, + T1Misc<{0,0,1,0,0,0,?}>; + +// test +let isCommutable = 1, Defs = [CPSR] in +def tTST : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, + "tst", "\t$lhs, $rhs", + [(ARMcmpZ (and tGPR:$lhs, tGPR:$rhs), 0)]>, + T1DataProcessing<0b1000>; + +// zero-extend byte +def tUXTB : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "uxtb", "\t$dst, $src", + [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>, + Requires<[IsThumb1Only, HasV6]>, + T1Misc<{0,0,1,0,1,1,?}>; + +// zero-extend short +def tUXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "uxth", "\t$dst, $src", + [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>, + Requires<[IsThumb1Only, HasV6]>, + T1Misc<{0,0,1,0,1,0,?}>; + + +// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation. +// Expanded after instruction selection into a branch sequence. +let usesCustomInserter = 1 in // Expanded after instruction selection. + def tMOVCCr_pseudo : + PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc), + NoItinerary, "${:comment} tMOVCCr $cc", + [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>; + + +// 16-bit movcc in IT blocks for Thumb2. +let neverHasSideEffects = 1 in { +def tMOVCCr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iCMOVr, + "mov", "\t$dst, $rhs", []>, + T1Special<{1,0,?,?}>; + +def tMOVCCi : T1pIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMOVi, + "mov", "\t$dst, $rhs", []>, + T1General<{1,0,0,?,?}>; +} // neverHasSideEffects + +// tLEApcrel - Load a pc-relative address into a register without offending the +// assembler. 
+let neverHasSideEffects = 1 in { +let isReMaterializable = 1 in +def tLEApcrel : T1I<(outs tGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, + "adr$p\t$dst, #$label", []>, + T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 + +def tLEApcrelJT : T1I<(outs tGPR:$dst), + (ins i32imm:$label, nohash_imm:$id, pred:$p), + IIC_iALUi, "adr$p\t$dst, #${label}_${id}", []>, + T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 +} // neverHasSideEffects + +//===----------------------------------------------------------------------===// +// TLS Instructions +// + +// __aeabi_read_tp preserves the registers r1-r3. +let isCall = 1, + Defs = [R0, LR] in { + def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br, + "bl\t__aeabi_read_tp", + [(set R0, ARMthread_pointer)]>; +} + +// SJLJ Exception handling intrinsics +// eh_sjlj_setjmp() is an instruction sequence to store the return +// address and save #0 in R0 for the non-longjmp case. +// Since by its nature we may be coming from some other function to get +// here, and we're using the stack frame for the containing function to +// save/restore registers, we can't keep anything live in regs across +// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon +// when we get here from a longjmp(). We force everthing out of registers +// except for our own input by listing the relevant registers in Defs. By +// doing so, we also cause the prologue/epilogue code to actively preserve +// all of the callee-saved resgisters, which is exactly what we want. +// The current SP is passed in $val, and we reuse the reg as a scratch. +let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R12 ] in { + def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val), + AddrModeNone, SizeSpecial, NoItinerary, + "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n" + "\tmov\t$val, pc\n" + "\tadds\t$val, #7\n" + "\tstr\t$val, [$src, #4]\n" + "\tmovs\tr0, #0\n" + "\tb\t1f\n" + "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n" + "1:", "", + [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>; +} + +// FIXME: Non-Darwin version(s) +let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, + Defs = [ R7, LR, SP ] in { +def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch), + AddrModeNone, SizeSpecial, IndexModeNone, + Pseudo, NoItinerary, + "ldr\t$scratch, [$src, #8]\n\t" + "mov\tsp, $scratch\n\t" + "ldr\t$scratch, [$src, #4]\n\t" + "ldr\tr7, [$src]\n\t" + "bx\t$scratch", "", + [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, + Requires<[IsThumb, IsDarwin]>; +} + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// Add with carry +def : T1Pat<(addc tGPR:$lhs, imm0_7:$rhs), + (tADDi3 tGPR:$lhs, imm0_7:$rhs)>; +def : T1Pat<(addc tGPR:$lhs, imm8_255:$rhs), + (tADDi8 tGPR:$lhs, imm8_255:$rhs)>; +def : T1Pat<(addc tGPR:$lhs, tGPR:$rhs), + (tADDrr tGPR:$lhs, tGPR:$rhs)>; + +// Subtract with carry +def : T1Pat<(addc tGPR:$lhs, imm0_7_neg:$rhs), + (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>; +def : T1Pat<(addc tGPR:$lhs, imm8_255_neg:$rhs), + (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>; +def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs), + (tSUBrr tGPR:$lhs, tGPR:$rhs)>; + +// ConstantPool, GlobalAddress +def : T1Pat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>; +def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>; + +// JumpTable +def : T1Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (tLEApcrelJT tjumptable:$dst, imm:$id)>; + +// Direct calls +def : T1Pat<(ARMtcall 
texternalsym:$func), (tBL texternalsym:$func)>, + Requires<[IsThumb, IsNotDarwin]>; +def : T1Pat<(ARMtcall texternalsym:$func), (tBLr9 texternalsym:$func)>, + Requires<[IsThumb, IsDarwin]>; + +def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>; +def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi_r9 texternalsym:$func)>, + Requires<[IsThumb, HasV5T, IsDarwin]>; + +// Indirect calls to ARM routines +def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>; +def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr_r9 GPR:$dst)>, + Requires<[IsThumb, HasV5T, IsDarwin]>; + +// zextload i1 -> zextload i8 +def : T1Pat<(zextloadi1 t_addrmode_s1:$addr), + (tLDRB t_addrmode_s1:$addr)>; + +// extload -> zextload +def : T1Pat<(extloadi1 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_s2:$addr), (tLDRH t_addrmode_s2:$addr)>; + +// If it's impossible to use [r,r] address mode for sextload, select to +// ldr{b|h} + sxt{b|h} instead. +def : T1Pat<(sextloadi8 t_addrmode_s1:$addr), + (tSXTB (tLDRB t_addrmode_s1:$addr))>, + Requires<[IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi16 t_addrmode_s2:$addr), + (tSXTH (tLDRH t_addrmode_s2:$addr))>, + Requires<[IsThumb1Only, HasV6]>; + +def : T1Pat<(sextloadi8 t_addrmode_s1:$addr), + (tASRri (tLSLri (tLDRB t_addrmode_s1:$addr), 24), 24)>; +def : T1Pat<(sextloadi16 t_addrmode_s1:$addr), + (tASRri (tLSLri (tLDRH t_addrmode_s1:$addr), 16), 16)>; + +// Large immediate handling. + +// Two piece imms. +def : T1Pat<(i32 thumb_immshifted:$src), + (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)), + (thumb_immshifted_shamt imm:$src))>; + +def : T1Pat<(i32 imm0_255_comp:$src), + (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>; + +// Pseudo instruction that combines ldr from constpool and add pc. This should +// be expanded into two instructions late to allow if-conversion and +// scheduling. +let isReMaterializable = 1 in +def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), + NoItinerary, "${:comment} ldr.n\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", + [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), + imm:$cp))]>, + Requires<[IsThumb1Only]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td new file mode 100644 index 0000000..b91c089 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -0,0 +1,2747 @@ +//===- ARMInstrThumb2.td - Thumb2 support for ARM -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Thumb2 instruction set. +// +//===----------------------------------------------------------------------===// + +// IT block predicate field +def it_pred : Operand<i32> { + let PrintMethod = "printMandatoryPredicateOperand"; +} + +// IT block condition mask +def it_mask : Operand<i32> { + let PrintMethod = "printThumbITMask"; +} + +// Table branch address +def tb_addrmode : Operand<i32> { + let PrintMethod = "printTBAddrMode"; +} + +// Shifted operands. No register controlled shifts for Thumb2. +// Note: We do not support rrx shifted operands yet. 
+def t2_so_reg : Operand<i32>, // reg imm + ComplexPattern<i32, 2, "SelectT2ShifterOperandReg", + [shl,srl,sra,rotr]> { + let PrintMethod = "printT2SOOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value +def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32); +}]>; + +// t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value +def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32); +}]>; + +// t2_so_imm - Match a 32-bit immediate operand, which is an +// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit +// immediate splatted into multiple bytes of the word. t2_so_imm values are +// represented in the imm field in the same 12-bit form that they are encoded +// into t2_so_imm instructions: the 8-bit immediate is the least significant +// bits [bits 0-7], the 4-bit shift/splat amount is the next 4 bits [bits 8-11]. +def t2_so_imm : Operand<i32>, + PatLeaf<(imm), [{ + return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1; +}]>; + +// t2_so_imm_not - Match an immediate that is a complement +// of a t2_so_imm. +def t2_so_imm_not : Operand<i32>, + PatLeaf<(imm), [{ + return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1; +}], t2_so_imm_not_XFORM>; + +// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm. +def t2_so_imm_neg : Operand<i32>, + PatLeaf<(imm), [{ + return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1; +}], t2_so_imm_neg_XFORM>; + +// Break t2_so_imm's up into two pieces. This handles immediates with up to 16 +// bits set in them. This uses t2_so_imm2part to match and t2_so_imm2part_[12] +// to get the first/second pieces. +def t2_so_imm2part : Operand<i32>, + PatLeaf<(imm), [{ + return ARM_AM::isT2SOImmTwoPartVal((unsigned)N->getZExtValue()); + }]> { +} + +def t2_so_imm2part_1 : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getT2SOImmTwoPartFirst((unsigned)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +def t2_so_imm2part_2 : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getT2SOImmTwoPartSecond((unsigned)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +def t2_so_neg_imm2part : Operand<i32>, PatLeaf<(imm), [{ + return ARM_AM::isT2SOImmTwoPartVal(-(int)N->getZExtValue()); + }]> { +} + +def t2_so_neg_imm2part_1 : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getT2SOImmTwoPartFirst(-(int)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +def t2_so_neg_imm2part_2 : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getT2SOImmTwoPartSecond(-(int)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +/// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31]. +def imm1_31 : PatLeaf<(i32 imm), [{ + return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 32; +}]>; + +/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095]. +def imm0_4095 : Operand<i32>, + PatLeaf<(i32 imm), [{ + return (uint32_t)N->getZExtValue() < 4096; +}]>; + +def imm0_4095_neg : PatLeaf<(i32 imm), [{ + return (uint32_t)(-N->getZExtValue()) < 4096; +}], imm_neg_XFORM>; + +def imm0_255_neg : PatLeaf<(i32 imm), [{ + return (uint32_t)(-N->getZExtValue()) < 255; +}], imm_neg_XFORM>; + +// Define Thumb2 specific addressing modes. 
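Before the Thumb-2 addressing modes that follow, a quick illustration (not part of this patch) of the constant set t2_so_imm accepts, as described above: an 8-bit value splatted across the word in one of the fixed byte patterns, or an 8-bit value with its top bit set, rotated right by 8 to 31 bits. The sketch below is a standalone C++ re-derivation of that architectural rule for illustration only; it is not the ARM_AM::getT2SOImmVal helper the operands actually call.

#include <cstdint>

// Returns true if v is representable as a Thumb-2 "modified immediate",
// i.e. the value set matched by t2_so_imm. Illustrative sketch only.
bool isT2ModifiedImm(uint32_t v) {
  uint32_t lo = v & 0xFFu;
  if (v <= 0xFFu) return true;                                       // 0x000000XY
  if (v == ((lo << 16) | lo)) return true;                           // 0x00XY00XY
  if (v == ((lo << 24) | (lo << 16) | (lo << 8) | lo)) return true;  // 0xXYXYXYXY
  uint32_t hi = (v >> 8) & 0xFFu;
  if (v == ((hi << 24) | (hi << 8))) return true;                    // 0xXY00XY00
  // Otherwise: an 8-bit value with bit 7 set, rotated right by 8..31 bits.
  for (unsigned rot = 8; rot < 32; ++rot) {
    uint32_t unrot = (v << rot) | (v >> (32 - rot)); // rotating left undoes the ROR
    if (unrot <= 0xFFu && (unrot & 0x80u)) return true;
  }
  return false;
}

For example, 0x00AB00AB and 0xFF000000 pass this check, while 0x00ABCDEF does not.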
+ +// t2addrmode_imm12 := reg + imm12 +def t2addrmode_imm12 : Operand<i32>, + ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> { + let PrintMethod = "printT2AddrModeImm12Operand"; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +// t2addrmode_imm8 := reg +/- imm8 +def t2addrmode_imm8 : Operand<i32>, + ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> { + let PrintMethod = "printT2AddrModeImm8Operand"; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +def t2am_imm8_offset : Operand<i32>, + ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset", []>{ + let PrintMethod = "printT2AddrModeImm8OffsetOperand"; +} + +// t2addrmode_imm8s4 := reg +/- (imm8 << 2) +def t2addrmode_imm8s4 : Operand<i32>, + ComplexPattern<i32, 2, "SelectT2AddrModeImm8s4", []> { + let PrintMethod = "printT2AddrModeImm8s4Operand"; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +def t2am_imm8s4_offset : Operand<i32> { + let PrintMethod = "printT2AddrModeImm8s4OffsetOperand"; +} + +// t2addrmode_so_reg := reg + (reg << imm2) +def t2addrmode_so_reg : Operand<i32>, + ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> { + let PrintMethod = "printT2AddrModeSoRegOperand"; + let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); +} + + +//===----------------------------------------------------------------------===// +// Multiclass helpers... +// + +/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a +/// unary operation that produces a value. These are predicable and can be +/// changed to modify CPSR. +multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Cheap = 0, bit ReMat = 0> { + // shifted imm + def i : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, + opc, "\t$dst, $src", + [(set GPR:$dst, (opnode t2_so_imm:$src))]> { + let isAsCheapAsAMove = Cheap; + let isReMaterializable = ReMat; + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{20} = ?; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; + } + // register + def r : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, + opc, ".w\t$dst, $src", + [(set GPR:$dst, (opnode GPR:$src))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = ?; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def s : T2sI<(outs GPR:$dst), (ins t2_so_reg:$src), IIC_iMOVsi, + opc, ".w\t$dst, $src", + [(set GPR:$dst, (opnode t2_so_reg:$src))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = ?; // The S bit. + let Inst{19-16} = 0b1111; // Rn + } +} + +/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a +/// binary operation that produces a value. These are predicable and can be +/// changed to modify CPSR. +multiclass T2I_bin_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0, string wide =""> { + // shifted imm + def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + opc, "\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{20} = ?; // The S bit. 
+ let Inst{15} = 0; + } + // register + def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + opc, !strconcat(wide, "\t$dst, $lhs, $rhs"), + [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> { + let isCommutable = Commutable; + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = ?; // The S bit. + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + opc, !strconcat(wide, "\t$dst, $lhs, $rhs"), + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = ?; // The S bit. + } +} + +/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need +// the ".w" prefix to indicate that they are wide. +multiclass T2I_bin_w_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> : + T2I_bin_irs<opcod, opc, opnode, Commutable, ".w">; + +/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are +/// reversed. It doesn't define the 'rr' form since it's handled by its +/// T2I_bin_irs counterpart. +multiclass T2I_rbin_is<bits<4> opcod, string opc, PatFrag opnode> { + // shifted imm + def ri : T2sI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), IIC_iALUi, + opc, ".w\t$dst, $rhs, $lhs", + [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{20} = ?; // The S bit. + let Inst{15} = 0; + } + // shifted register + def rs : T2sI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi, + opc, "\t$dst, $rhs, $lhs", + [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = ?; // The S bit. + } +} + +/// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the +/// instruction modifies the CPSR register. +let Defs = [CPSR] in { +multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { + // shifted imm + def ri : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{15} = 0; + } + // register + def rr : T2I<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> { + let isCommutable = Commutable; + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + } +} +} + +/// T2I_bin_ii12rs - Defines a set of (op reg, {so_imm|imm0_4095|r|so_reg}) +/// patterns for a binary operation that produces a value. 
+multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode, + bit Commutable = 0> { + // shifted imm + def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24} = 1; + let Inst{23-21} = op23_21; + let Inst{20} = 0; // The S bit. + let Inst{15} = 0; + } + // 12-bit imm + def ri12 : T2I<(outs GPR:$dst), (ins GPR:$lhs, imm0_4095:$rhs), IIC_iALUi, + !strconcat(opc, "w"), "\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, imm0_4095:$rhs))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24} = 0; + let Inst{23-21} = op23_21; + let Inst{20} = 0; // The S bit. + let Inst{15} = 0; + } + // register + def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> { + let isCommutable = Commutable; + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24} = 1; + let Inst{23-21} = op23_21; + let Inst{20} = 0; // The S bit. + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24} = 1; + let Inst{23-21} = op23_21; + let Inst{20} = 0; // The S bit. + } +} + +/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns +/// for a binary operation that produces a value and use the carry +/// bit. It's not predicable. +let Uses = [CPSR] in { +multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { + // shifted imm + def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + opc, "\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>, + Requires<[IsThumb2]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{20} = 0; // The S bit. + let Inst{15} = 0; + } + // register + def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>, + Requires<[IsThumb2]> { + let isCommutable = Commutable; + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 0; // The S bit. + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>, + Requires<[IsThumb2]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 0; // The S bit. + } +} + +// Carry setting variants +let Defs = [CPSR] in { +multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { + // shifted imm + def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + opc, "\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>, + Requires<[IsThumb2]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. 
+ let Inst{15} = 0; + } + // register + def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>, + Requires<[IsThumb2]> { + let isCommutable = Commutable; + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>, + Requires<[IsThumb2]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + } +} +} +} + +/// T2I_rbin_s_is - Same as T2I_rbin_is except sets 's' bit. +let Defs = [CPSR] in { +multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> { + // shifted imm + def ri : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), IIC_iALUi, + !strconcat(opc, "s"), ".w\t$dst, $rhs, $lhs", + [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{15} = 0; + } + // shifted register + def rs : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi, + !strconcat(opc, "s"), "\t$dst, $rhs, $lhs", + [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + } +} +} + +/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift / +// rotate operation that produces a value. +multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> { + // 5-bit imm + def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iMOVsi, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, imm1_31:$rhs))]> { + let Inst{31-27} = 0b11101; + let Inst{26-21} = 0b010010; + let Inst{19-16} = 0b1111; // Rn + let Inst{5-4} = opcod; + } + // register + def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iMOVsr, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-21} = opcod; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b0000; + } +} + +/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test +/// patterns. Similar to T2I_bin_irs except the instruction does not produce +/// a explicit result, only implicitly set CPSR. +let Defs = [CPSR] in { +multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> { + // shifted imm + def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iCMPi, + opc, ".w\t$lhs, $rhs", + [(opnode GPR:$lhs, t2_so_imm:$rhs)]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{15} = 0; + let Inst{11-8} = 0b1111; // Rd + } + // register + def rr : T2I<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr, + opc, ".w\t$lhs, $rhs", + [(opnode GPR:$lhs, GPR:$rhs)]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. 
+ let Inst{14-12} = 0b000; // imm3 + let Inst{11-8} = 0b1111; // Rd + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2I<(outs), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iCMPsi, + opc, ".w\t$lhs, $rhs", + [(opnode GPR:$lhs, t2_so_reg:$rhs)]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{11-8} = 0b1111; // Rd + } +} +} + +/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns. +multiclass T2I_ld<bit signed, bits<2> opcod, string opc, PatFrag opnode> { + def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr), IIC_iLoadi, + opc, ".w\t$dst, $addr", + [(set GPR:$dst, (opnode t2addrmode_imm12:$addr))]> { + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 1; + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + } + def i8 : T2Ii8 <(outs GPR:$dst), (ins t2addrmode_imm8:$addr), IIC_iLoadi, + opc, "\t$dst, $addr", + [(set GPR:$dst, (opnode t2addrmode_imm8:$addr))]> { + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{11} = 1; + // Offset: index==TRUE, wback==FALSE + let Inst{10} = 1; // The P bit. + let Inst{8} = 0; // The W bit. + } + def s : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr), IIC_iLoadr, + opc, ".w\t$dst, $addr", + [(set GPR:$dst, (opnode t2addrmode_so_reg:$addr))]> { + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{11-6} = 0b000000; + } + def pci : T2Ipc <(outs GPR:$dst), (ins i32imm:$addr), IIC_iLoadi, + opc, ".w\t$dst, $addr", + [(set GPR:$dst, (opnode (ARMWrapper tconstpool:$addr)))]> { + let isReMaterializable = 1; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = ?; // add = (U == '1') + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{19-16} = 0b1111; // Rn + } +} + +/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns. +multiclass T2I_st<bits<2> opcod, string opc, PatFrag opnode> { + def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr), IIC_iStorei, + opc, ".w\t$src, $addr", + [(opnode GPR:$src, t2addrmode_imm12:$addr)]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0001; + let Inst{22-21} = opcod; + let Inst{20} = 0; // !load + } + def i8 : T2Ii8 <(outs), (ins GPR:$src, t2addrmode_imm8:$addr), IIC_iStorei, + opc, "\t$src, $addr", + [(opnode GPR:$src, t2addrmode_imm8:$addr)]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0000; + let Inst{22-21} = opcod; + let Inst{20} = 0; // !load + let Inst{11} = 1; + // Offset: index==TRUE, wback==FALSE + let Inst{10} = 1; // The P bit. + let Inst{8} = 0; // The W bit. + } + def s : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr), IIC_iStorer, + opc, ".w\t$src, $addr", + [(opnode GPR:$src, t2addrmode_so_reg:$addr)]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0000; + let Inst{22-21} = opcod; + let Inst{20} = 0; // !load + let Inst{11-6} = 0b000000; + } +} + +/// T2I_unary_rrot - A unary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. 
+multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> { + def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + opc, ".w\t$dst, $src", + [(set GPR:$dst, (opnode GPR:$src))]>, + Requires<[HasT2ExtractPack]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = 0b00; // rotate + } + def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi, + opc, ".w\t$dst, $src, ror $rot", + [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>, + Requires<[HasT2ExtractPack]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = {?,?}; // rotate + } +} + +// SXTB16 and UXTB16 do not need the .w qualifier. +multiclass T2I_unary_rrot_nw<bits<3> opcod, string opc, PatFrag opnode> { + def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + opc, "\t$dst, $src", + [(set GPR:$dst, (opnode GPR:$src))]>, + Requires<[HasT2ExtractPack]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = 0b00; // rotate + } + def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi, + opc, "\t$dst, $src, ror $rot", + [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>, + Requires<[HasT2ExtractPack]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = {?,?}; // rotate + } +} + +// DO variant - disassembly only, no pattern + +multiclass T2I_unary_rrot_DO<bits<3> opcod, string opc> { + def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + opc, "\t$dst, $src", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = 0b00; // rotate + } + def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi, + opc, "\t$dst, $src, ror $rot", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = {?,?}; // rotate + } +} + +/// T2I_bin_rrot - A binary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. 
+multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> { + def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), IIC_iALUr, + opc, "\t$dst, $LHS, $RHS", + [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>, + Requires<[HasT2ExtractPack]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = 0b00; // rotate + } + def rr_rot : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot), + IIC_iALUsr, opc, "\t$dst, $LHS, $RHS, ror $rot", + [(set GPR:$dst, (opnode GPR:$LHS, + (rotr GPR:$RHS, rot_imm:$rot)))]>, + Requires<[HasT2ExtractPack]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = {?,?}; // rotate + } +} + +// DO variant - disassembly only, no pattern + +multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> { + def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), IIC_iALUr, + opc, "\t$dst, $LHS, $RHS", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = 0b00; // rotate + } + def rr_rot : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot), + IIC_iALUsr, opc, "\t$dst, $LHS, $RHS, ror $rot", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = {?,?}; // rotate + } +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +// LEApcrel - Load a pc-relative address into a register without offending the +// assembler. +let neverHasSideEffects = 1 in { +let isReMaterializable = 1 in +def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, + "adr$p.w\t$dst, #$label", []> { + let Inst{31-27} = 0b11110; + let Inst{25-24} = 0b10; + // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE) + let Inst{22} = 0; + let Inst{20} = 0; + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; +} +def t2LEApcrelJT : T2XI<(outs GPR:$dst), + (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi, + "adr$p.w\t$dst, #${label}_${id}", []> { + let Inst{31-27} = 0b11110; + let Inst{25-24} = 0b10; + // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE) + let Inst{22} = 0; + let Inst{20} = 0; + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; +} +} // neverHasSideEffects + +// ADD r, sp, {so_imm|i12} +def t2ADDrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), + IIC_iALUi, "add", ".w\t$dst, $sp, $imm", []> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b1000; + let Inst{20} = ?; // The S bit. + let Inst{19-16} = 0b1101; // Rn = sp + let Inst{15} = 0; +} +def t2ADDrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), + IIC_iALUi, "addw", "\t$dst, $sp, $imm", []> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-21} = 0b0000; + let Inst{20} = 0; // The S bit. 
+ let Inst{19-16} = 0b1101; // Rn = sp + let Inst{15} = 0; +} + +// ADD r, sp, so_reg +def t2ADDrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), + IIC_iALUsi, "add", ".w\t$dst, $sp, $rhs", []> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b1000; + let Inst{20} = ?; // The S bit. + let Inst{19-16} = 0b1101; // Rn = sp + let Inst{15} = 0; +} + +// SUB r, sp, {so_imm|i12} +def t2SUBrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), + IIC_iALUi, "sub", ".w\t$dst, $sp, $imm", []> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b1101; + let Inst{20} = ?; // The S bit. + let Inst{19-16} = 0b1101; // Rn = sp + let Inst{15} = 0; +} +def t2SUBrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), + IIC_iALUi, "subw", "\t$dst, $sp, $imm", []> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-21} = 0b0101; + let Inst{20} = 0; // The S bit. + let Inst{19-16} = 0b1101; // Rn = sp + let Inst{15} = 0; +} + +// SUB r, sp, so_reg +def t2SUBrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), + IIC_iALUsi, + "sub", "\t$dst, $sp, $rhs", []> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b1101; + let Inst{20} = ?; // The S bit. + let Inst{19-16} = 0b1101; // Rn = sp + let Inst{15} = 0; +} + +// Signed and unsigned division on v7-M +def t2SDIV : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALUi, + "sdiv", "\t$dst, $a, $b", + [(set GPR:$dst, (sdiv GPR:$a, GPR:$b))]>, + Requires<[HasDivide]> { + let Inst{31-27} = 0b11111; + let Inst{26-21} = 0b011100; + let Inst{20} = 0b1; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b1111; +} + +def t2UDIV : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALUi, + "udiv", "\t$dst, $a, $b", + [(set GPR:$dst, (udiv GPR:$a, GPR:$b))]>, + Requires<[HasDivide]> { + let Inst{31-27} = 0b11111; + let Inst{26-21} = 0b011101; + let Inst{20} = 0b1; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b1111; +} + +// Pseudo instruction that will expand into a t2SUBrSPi + a copy. +let usesCustomInserter = 1 in { // Expanded after instruction selection. +def t2SUBrSPi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), + NoItinerary, "${:comment} sub.w\t$dst, $sp, $imm", []>; +def t2SUBrSPi12_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), + NoItinerary, "${:comment} subw\t$dst, $sp, $imm", []>; +def t2SUBrSPs_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), + NoItinerary, "${:comment} sub\t$dst, $sp, $rhs", []>; +} // usesCustomInserter + + +//===----------------------------------------------------------------------===// +// Load / store Instructions. 
+// + +// Load +let canFoldAsLoad = 1, isReMaterializable = 1 in +defm t2LDR : T2I_ld<0, 0b10, "ldr", UnOpFrag<(load node:$Src)>>; + +// Loads with zero extension +defm t2LDRH : T2I_ld<0, 0b01, "ldrh", UnOpFrag<(zextloadi16 node:$Src)>>; +defm t2LDRB : T2I_ld<0, 0b00, "ldrb", UnOpFrag<(zextloadi8 node:$Src)>>; + +// Loads with sign extension +defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", UnOpFrag<(sextloadi16 node:$Src)>>; +defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", UnOpFrag<(sextloadi8 node:$Src)>>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +// Load doubleword +def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs GPR:$dst1, GPR:$dst2), + (ins t2addrmode_imm8s4:$addr), + IIC_iLoadi, "ldrd", "\t$dst1, $addr", []>; +def t2LDRDpci : T2Ii8s4<1, 0, 1, (outs GPR:$dst1, GPR:$dst2), + (ins i32imm:$addr), IIC_iLoadi, + "ldrd", "\t$dst1, $addr", []> { + let Inst{19-16} = 0b1111; // Rn +} +} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 + +// zextload i1 -> zextload i8 +def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(zextloadi1 t2addrmode_imm8:$addr), + (t2LDRBi8 t2addrmode_imm8:$addr)>; +def : T2Pat<(zextloadi1 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)), + (t2LDRBpci tconstpool:$addr)>; + +// extload -> zextload +// FIXME: Reduce the number of patterns by legalizing extload to zextload +// earlier? +def : T2Pat<(extloadi1 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(extloadi1 t2addrmode_imm8:$addr), + (t2LDRBi8 t2addrmode_imm8:$addr)>; +def : T2Pat<(extloadi1 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(extloadi1 (ARMWrapper tconstpool:$addr)), + (t2LDRBpci tconstpool:$addr)>; + +def : T2Pat<(extloadi8 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(extloadi8 t2addrmode_imm8:$addr), + (t2LDRBi8 t2addrmode_imm8:$addr)>; +def : T2Pat<(extloadi8 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(extloadi8 (ARMWrapper tconstpool:$addr)), + (t2LDRBpci tconstpool:$addr)>; + +def : T2Pat<(extloadi16 t2addrmode_imm12:$addr), + (t2LDRHi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(extloadi16 t2addrmode_imm8:$addr), + (t2LDRHi8 t2addrmode_imm8:$addr)>; +def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr), + (t2LDRHs t2addrmode_so_reg:$addr)>; +def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)), + (t2LDRHpci tconstpool:$addr)>; + +// Indexed loads +let mayLoad = 1, neverHasSideEffects = 1 in { +def t2LDR_PRE : T2Iidxldst<0, 0b10, 1, 1, (outs GPR:$dst, GPR:$base_wb), + (ins t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoadiu, + "ldr", "\t$dst, $addr!", "$addr.base = $base_wb", + []>; + +def t2LDR_POST : T2Iidxldst<0, 0b10, 1, 0, (outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoadiu, + "ldr", "\t$dst, [$base], $offset", "$base = $base_wb", + []>; + +def t2LDRB_PRE : T2Iidxldst<0, 0b00, 1, 1, (outs GPR:$dst, GPR:$base_wb), + (ins t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoadiu, + "ldrb", "\t$dst, $addr!", "$addr.base = $base_wb", + []>; +def t2LDRB_POST : T2Iidxldst<0, 0b00, 1, 0, (outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoadiu, + "ldrb", "\t$dst, [$base], $offset", "$base = $base_wb", + []>; + +def t2LDRH_PRE : T2Iidxldst<0, 0b01, 1, 1, (outs GPR:$dst, 
GPR:$base_wb), + (ins t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoadiu, + "ldrh", "\t$dst, $addr!", "$addr.base = $base_wb", + []>; +def t2LDRH_POST : T2Iidxldst<0, 0b01, 1, 0, (outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoadiu, + "ldrh", "\t$dst, [$base], $offset", "$base = $base_wb", + []>; + +def t2LDRSB_PRE : T2Iidxldst<1, 0b00, 1, 1, (outs GPR:$dst, GPR:$base_wb), + (ins t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoadiu, + "ldrsb", "\t$dst, $addr!", "$addr.base = $base_wb", + []>; +def t2LDRSB_POST : T2Iidxldst<1, 0b00, 1, 0, (outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoadiu, + "ldrsb", "\t$dst, [$base], $offset", "$base = $base_wb", + []>; + +def t2LDRSH_PRE : T2Iidxldst<1, 0b01, 1, 1, (outs GPR:$dst, GPR:$base_wb), + (ins t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoadiu, + "ldrsh", "\t$dst, $addr!", "$addr.base = $base_wb", + []>; +def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoadiu, + "ldrsh", "\t$dst, [$base], $offset", "$base = $base_wb", + []>; +} // mayLoad = 1, neverHasSideEffects = 1 + +// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110) and are +// for disassembly only. +// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4 +class T2IldT<bit signed, bits<2> type, string opc> + : T2Ii8<(outs GPR:$dst), (ins t2addrmode_imm8:$addr), IIC_iLoadi, opc, + "\t$dst, $addr", []> { + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = type; + let Inst{20} = 1; // load + let Inst{11} = 1; + let Inst{10-8} = 0b110; // PUW. 
+} + +def t2LDRT : T2IldT<0, 0b10, "ldrt">; +def t2LDRBT : T2IldT<0, 0b00, "ldrbt">; +def t2LDRHT : T2IldT<0, 0b01, "ldrht">; +def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt">; +def t2LDRSHT : T2IldT<1, 0b01, "ldrsht">; + +// Store +defm t2STR :T2I_st<0b10,"str", BinOpFrag<(store node:$LHS, node:$RHS)>>; +defm t2STRB:T2I_st<0b00,"strb",BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; +defm t2STRH:T2I_st<0b01,"strh",BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; + +// Store doubleword +let mayLoad = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in +def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), + (ins GPR:$src1, GPR:$src2, t2addrmode_imm8s4:$addr), + IIC_iStorer, "strd", "\t$src1, $addr", []>; + +// Indexed stores +def t2STR_PRE : T2Iidxldst<0, 0b10, 0, 1, (outs GPR:$base_wb), + (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePre, IIC_iStoreiu, + "str", "\t$src, [$base, $offset]!", "$base = $base_wb", + [(set GPR:$base_wb, + (pre_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + +def t2STR_POST : T2Iidxldst<0, 0b10, 0, 0, (outs GPR:$base_wb), + (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iStoreiu, + "str", "\t$src, [$base], $offset", "$base = $base_wb", + [(set GPR:$base_wb, + (post_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + +def t2STRH_PRE : T2Iidxldst<0, 0b01, 0, 1, (outs GPR:$base_wb), + (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePre, IIC_iStoreiu, + "strh", "\t$src, [$base, $offset]!", "$base = $base_wb", + [(set GPR:$base_wb, + (pre_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + +def t2STRH_POST : T2Iidxldst<0, 0b01, 0, 0, (outs GPR:$base_wb), + (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iStoreiu, + "strh", "\t$src, [$base], $offset", "$base = $base_wb", + [(set GPR:$base_wb, + (post_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + +def t2STRB_PRE : T2Iidxldst<0, 0b00, 0, 1, (outs GPR:$base_wb), + (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePre, IIC_iStoreiu, + "strb", "\t$src, [$base, $offset]!", "$base = $base_wb", + [(set GPR:$base_wb, + (pre_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + +def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb), + (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iStoreiu, + "strb", "\t$src, [$base], $offset", "$base = $base_wb", + [(set GPR:$base_wb, + (post_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + +// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly +// only. +// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4 +class T2IstT<bits<2> type, string opc> + : T2Ii8<(outs GPR:$src), (ins t2addrmode_imm8:$addr), IIC_iStorei, opc, + "\t$src, $addr", []> { + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = 0; // not signed + let Inst{23} = 0; + let Inst{22-21} = type; + let Inst{20} = 0; // store + let Inst{11} = 1; + let Inst{10-8} = 0b110; // PUW +} + +def t2STRT : T2IstT<0b10, "strt">; +def t2STRBT : T2IstT<0b00, "strbt">; +def t2STRHT : T2IstT<0b01, "strht">; + +// ldrd / strd pre / post variants +// For disassembly only. 
+ +def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$base, t2am_imm8s4_offset:$imm), NoItinerary, + "ldrd", "\t$dst1, $dst2, [$base, $imm]!", []>; + +def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$base, t2am_imm8s4_offset:$imm), NoItinerary, + "ldrd", "\t$dst1, $dst2, [$base], $imm", []>; + +def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs), + (ins GPR:$src1, GPR:$src2, GPR:$base, t2am_imm8s4_offset:$imm), + NoItinerary, "strd", "\t$src1, $src2, [$base, $imm]!", []>; + +def t2STRD_POST : T2Ii8s4<0, 1, 0, (outs), + (ins GPR:$src1, GPR:$src2, GPR:$base, t2am_imm8s4_offset:$imm), + NoItinerary, "strd", "\t$src1, $src2, [$base], $imm", []>; + +// T2Ipl (Preload Data/Instruction) signals the memory system of possible future +// data/instruction access. These are for disassembly only. +// +// A8.6.117, A8.6.118. Different instructions are generated for #0 and #-0. +// The neg_zero operand translates -0 to -1, -1 to -2, ..., etc. +multiclass T2Ipl<bit instr, bit write, string opc> { + + def i12 : T2I<(outs), (ins GPR:$base, i32imm:$imm), IIC_iLoadi, opc, + "\t[$base, $imm]", []> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{23} = 1; // U = 1 + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{15-12} = 0b1111; + } + + def i8 : T2I<(outs), (ins GPR:$base, neg_zero:$imm), IIC_iLoadi, opc, + "\t[$base, $imm]", []> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{23} = 0; // U = 0 + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{15-12} = 0b1111; + let Inst{11-8} = 0b1100; + } + + def pci : T2I<(outs), (ins GPR:$base, neg_zero:$imm), IIC_iLoadi, opc, + "\t[pc, $imm]", []> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{23} = ?; // add = (U == 1) + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{19-16} = 0b1111; // Rn = 0b1111 + let Inst{15-12} = 0b1111; + } + + def r : T2I<(outs), (ins GPR:$base, GPR:$a), IIC_iLoadi, opc, + "\t[$base, $a]", []> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{23} = 0; // add = TRUE for T1 + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{15-12} = 0b1111; + let Inst{11-6} = 0000000; + let Inst{5-4} = 0b00; // no shift is applied + } + + def s : T2I<(outs), (ins GPR:$base, GPR:$a, i32imm:$shamt), IIC_iLoadi, opc, + "\t[$base, $a, lsl $shamt]", []> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{23} = 0; // add = TRUE for T1 + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{15-12} = 0b1111; + let Inst{11-6} = 0000000; + } +} + +defm t2PLD : T2Ipl<0, 0, "pld">; +defm t2PLDW : T2Ipl<0, 1, "pldw">; +defm t2PLI : T2Ipl<1, 0, "pli">; + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +def t2LDM : T2XI<(outs), (ins addrmode4:$addr, pred:$p, + reglist:$dsts, variable_ops), IIC_iLoadm, + "ldm${addr:submode}${p}${addr:wide}\t$addr, $dsts", []> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' + let Inst{22} = 0; + let Inst{21} = 0; // The W bit. 
+ let Inst{20} = 1; // Load +} + +def t2LDM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, + reglist:$dsts, variable_ops), IIC_iLoadm, + "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts", + "$addr.addr = $wb", []> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' + let Inst{22} = 0; + let Inst{21} = 1; // The W bit. + let Inst{20} = 1; // Load +} +} // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq + +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { +def t2STM : T2XI<(outs), (ins addrmode4:$addr, pred:$p, + reglist:$srcs, variable_ops), IIC_iStorem, + "stm${addr:submode}${p}${addr:wide}\t$addr, $srcs", []> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' + let Inst{22} = 0; + let Inst{21} = 0; // The W bit. + let Inst{20} = 0; // Store +} + +def t2STM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, + reglist:$srcs, variable_ops), + IIC_iStorem, + "stm${addr:submode}${p}${addr:wide}\t$addr!, $srcs", + "$addr.addr = $wb", []> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' + let Inst{22} = 0; + let Inst{21} = 1; // The W bit. + let Inst{20} = 0; // Store +} +} // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq + +//===----------------------------------------------------------------------===// +// Move Instructions. +// + +let neverHasSideEffects = 1 in +def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, + "mov", ".w\t$dst, $src", []> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{20} = ?; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{14-12} = 0b000; + let Inst{7-4} = 0b0000; +} + +// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16. +let isReMaterializable = 1, isAsCheapAsAMove = 1, AddedComplexity = 1 in +def t2MOVi : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, + "mov", ".w\t$dst, $src", + [(set GPR:$dst, t2_so_imm:$src)]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b0010; + let Inst{20} = ?; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def t2MOVi16 : T2I<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi, + "movw", "\t$dst, $src", + [(set GPR:$dst, imm0_65535:$src)]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-21} = 0b0010; + let Inst{20} = 0; // The S bit. + let Inst{15} = 0; +} + +let Constraints = "$src = $dst" in +def t2MOVTi16 : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm), IIC_iMOVi, + "movt", "\t$dst, $imm", + [(set GPR:$dst, + (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-21} = 0b0110; + let Inst{20} = 0; // The S bit. + let Inst{15} = 0; +} + +def : T2Pat<(or GPR:$src, 0xffff0000), (t2MOVTi16 GPR:$src, 0xffff)>; + +//===----------------------------------------------------------------------===// +// Extend Instructions. 
+// + +// Sign extenders + +defm t2SXTB : T2I_unary_rrot<0b100, "sxtb", + UnOpFrag<(sext_inreg node:$Src, i8)>>; +defm t2SXTH : T2I_unary_rrot<0b000, "sxth", + UnOpFrag<(sext_inreg node:$Src, i16)>>; +defm t2SXTB16 : T2I_unary_rrot_DO<0b010, "sxtb16">; + +defm t2SXTAB : T2I_bin_rrot<0b100, "sxtab", + BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; +defm t2SXTAH : T2I_bin_rrot<0b000, "sxtah", + BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; +defm t2SXTAB16 : T2I_bin_rrot_DO<0b010, "sxtab16">; + +// TODO: SXT(A){B|H}16 - done for disassembly only + +// Zero extenders + +let AddedComplexity = 16 in { +defm t2UXTB : T2I_unary_rrot<0b101, "uxtb", + UnOpFrag<(and node:$Src, 0x000000FF)>>; +defm t2UXTH : T2I_unary_rrot<0b001, "uxth", + UnOpFrag<(and node:$Src, 0x0000FFFF)>>; +defm t2UXTB16 : T2I_unary_rrot_nw<0b011, "uxtb16", + UnOpFrag<(and node:$Src, 0x00FF00FF)>>; + +def : T2Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), + (t2UXTB16r_rot GPR:$Src, 24)>; +def : T2Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), + (t2UXTB16r_rot GPR:$Src, 8)>; + +defm t2UXTAB : T2I_bin_rrot<0b101, "uxtab", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; +defm t2UXTAH : T2I_bin_rrot<0b001, "uxtah", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; +defm t2UXTAB16 : T2I_bin_rrot_DO<0b011, "uxtab16">; +} + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. +// + +defm t2ADD : T2I_bin_ii12rs<0b000, "add", + BinOpFrag<(add node:$LHS, node:$RHS)>, 1>; +defm t2SUB : T2I_bin_ii12rs<0b101, "sub", + BinOpFrag<(sub node:$LHS, node:$RHS)>>; + +// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants. +defm t2ADDS : T2I_bin_s_irs <0b1000, "add", + BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>; +defm t2SUBS : T2I_bin_s_irs <0b1101, "sub", + BinOpFrag<(subc node:$LHS, node:$RHS)>>; + +defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", + BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>; +defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", + BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>; +defm t2ADCS : T2I_adde_sube_s_irs<0b1010, "adc", + BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; +defm t2SBCS : T2I_adde_sube_s_irs<0b1011, "sbc", + BinOpFrag<(sube_live_carry node:$LHS, node:$RHS)>>; + +// RSB +defm t2RSB : T2I_rbin_is <0b1110, "rsb", + BinOpFrag<(sub node:$LHS, node:$RHS)>>; +defm t2RSBS : T2I_rbin_s_is <0b1110, "rsb", + BinOpFrag<(subc node:$LHS, node:$RHS)>>; + +// (sub X, imm) gets canonicalized to (add X, -imm). Match this form. 
+let AddedComplexity = 1 in +def : T2Pat<(add GPR:$src, imm0_255_neg:$imm), + (t2SUBri GPR:$src, imm0_255_neg:$imm)>; +def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm), + (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), + (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>; + +// Select Bytes -- for disassembly only + +def t2SEL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), NoItinerary, "sel", + "\t$dst, $a, $b", []> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b010; + let Inst{23} = 0b1; + let Inst{22-20} = 0b010; + let Inst{15-12} = 0b1111; + let Inst{7} = 0b1; + let Inst{6-4} = 0b000; +} + +// A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned) +// And Miscellaneous operations -- for disassembly only +class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc> + : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), NoItinerary, opc, + "\t$dst, $a, $b", [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0101; + let Inst{22-20} = op22_20; + let Inst{15-12} = 0b1111; + let Inst{7-4} = op7_4; +} + +// Saturating add/subtract -- for disassembly only + +def t2QADD : T2I_pam<0b000, 0b1000, "qadd">; +def t2QADD16 : T2I_pam<0b001, 0b0001, "qadd16">; +def t2QADD8 : T2I_pam<0b000, 0b0001, "qadd8">; +def t2QASX : T2I_pam<0b010, 0b0001, "qasx">; +def t2QDADD : T2I_pam<0b000, 0b1001, "qdadd">; +def t2QDSUB : T2I_pam<0b000, 0b1011, "qdsub">; +def t2QSAX : T2I_pam<0b110, 0b0001, "qsax">; +def t2QSUB : T2I_pam<0b000, 0b1010, "qsub">; +def t2QSUB16 : T2I_pam<0b101, 0b0001, "qsub16">; +def t2QSUB8 : T2I_pam<0b100, 0b0001, "qsub8">; +def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">; +def t2UQADD8 : T2I_pam<0b000, 0b0101, "uqadd8">; +def t2UQASX : T2I_pam<0b010, 0b0101, "uqasx">; +def t2UQSAX : T2I_pam<0b110, 0b0101, "uqsax">; +def t2UQSUB16 : T2I_pam<0b101, 0b0101, "uqsub16">; +def t2UQSUB8 : T2I_pam<0b100, 0b0101, "uqsub8">; + +// Signed/Unsigned add/subtract -- for disassembly only + +def t2SASX : T2I_pam<0b010, 0b0000, "sasx">; +def t2SADD16 : T2I_pam<0b001, 0b0000, "sadd16">; +def t2SADD8 : T2I_pam<0b000, 0b0000, "sadd8">; +def t2SSAX : T2I_pam<0b110, 0b0000, "ssax">; +def t2SSUB16 : T2I_pam<0b101, 0b0000, "ssub16">; +def t2SSUB8 : T2I_pam<0b100, 0b0000, "ssub8">; +def t2UASX : T2I_pam<0b010, 0b0100, "uasx">; +def t2UADD16 : T2I_pam<0b001, 0b0100, "uadd16">; +def t2UADD8 : T2I_pam<0b000, 0b0100, "uadd8">; +def t2USAX : T2I_pam<0b110, 0b0100, "usax">; +def t2USUB16 : T2I_pam<0b101, 0b0100, "usub16">; +def t2USUB8 : T2I_pam<0b100, 0b0100, "usub8">; + +// Signed/Unsigned halving add/subtract -- for disassembly only + +def t2SHASX : T2I_pam<0b010, 0b0010, "shasx">; +def t2SHADD16 : T2I_pam<0b001, 0b0010, "shadd16">; +def t2SHADD8 : T2I_pam<0b000, 0b0010, "shadd8">; +def t2SHSAX : T2I_pam<0b110, 0b0010, "shsax">; +def t2SHSUB16 : T2I_pam<0b101, 0b0010, "shsub16">; +def t2SHSUB8 : T2I_pam<0b100, 0b0010, "shsub8">; +def t2UHASX : T2I_pam<0b010, 0b0110, "uhasx">; +def t2UHADD16 : T2I_pam<0b001, 0b0110, "uhadd16">; +def t2UHADD8 : T2I_pam<0b000, 0b0110, "uhadd8">; +def t2UHSAX : T2I_pam<0b110, 0b0110, "uhsax">; +def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">; +def t2UHSUB8 : T2I_pam<0b100, 0b0110, "uhsub8">; + +// Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only + +def t2USAD8 : T2I_mac<0, 0b111, 0b0000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + NoItinerary, "usad8", "\t$dst, $a, $b", []> { + let Inst{15-12} = 0b1111; +} +def t2USADA8 : T2I_mac<0, 0b111, 0b0000, (outs GPR:$dst), 
+ (ins GPR:$a, GPR:$b, GPR:$acc), NoItinerary, "usada8", + "\t$dst, $a, $b, $acc", []>; + +// Signed/Unsigned saturate -- for disassembly only + +def t2SSATlsl : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), + NoItinerary, "ssat", "\t$dst, $bit_pos, $a, lsl $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1100; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 0; // sh = '0' +} + +def t2SSATasr : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), + NoItinerary, "ssat", "\t$dst, $bit_pos, $a, asr $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1100; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' +} + +def t2SSAT16 : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), NoItinerary, + "ssat16", "\t$dst, $bit_pos, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1100; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' + let Inst{14-12} = 0b000; // imm3 = '000' + let Inst{7-6} = 0b00; // imm2 = '00' +} + +def t2USATlsl : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), + NoItinerary, "usat", "\t$dst, $bit_pos, $a, lsl $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1110; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 0; // sh = '0' +} + +def t2USATasr : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), + NoItinerary, "usat", "\t$dst, $bit_pos, $a, asr $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1110; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' +} + +def t2USAT16 : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), NoItinerary, + "usat16", "\t$dst, $bit_pos, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1110; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' + let Inst{14-12} = 0b000; // imm3 = '000' + let Inst{7-6} = 0b00; // imm2 = '00' +} + +//===----------------------------------------------------------------------===// +// Shift and rotate Instructions. +// + +defm t2LSL : T2I_sh_ir<0b00, "lsl", BinOpFrag<(shl node:$LHS, node:$RHS)>>; +defm t2LSR : T2I_sh_ir<0b01, "lsr", BinOpFrag<(srl node:$LHS, node:$RHS)>>; +defm t2ASR : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra node:$LHS, node:$RHS)>>; +defm t2ROR : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>; + +let Uses = [CPSR] in { +def t2MOVrx : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, + "rrx", "\t$dst, $src", + [(set GPR:$dst, (ARMrrx GPR:$src))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{20} = ?; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{14-12} = 0b000; + let Inst{7-4} = 0b0011; +} +} + +let Defs = [CPSR] in { +def t2MOVsrl_flag : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, + "lsrs", ".w\t$dst, $src, #1", + [(set GPR:$dst, (ARMsrl_flag GPR:$src))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{20} = 1; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{5-4} = 0b01; // Shift type. + // Shift amount = Inst{14-12:7-6} = 1. 
+  let Inst{14-12} = 0b000;
+  let Inst{7-6} = 0b01;
+}
+def t2MOVsra_flag : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                        "asrs", ".w\t$dst, $src, #1",
+                        [(set GPR:$dst, (ARMsra_flag GPR:$src))]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 1; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = 0b10; // Shift type.
+  // Shift amount = Inst{14-12:7-6} = 1.
+  let Inst{14-12} = 0b000;
+  let Inst{7-6} = 0b01;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// Bitwise Instructions.
+//
+
+defm t2AND  : T2I_bin_w_irs<0b0000, "and",
+                            BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+defm t2ORR  : T2I_bin_w_irs<0b0010, "orr",
+                            BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
+defm t2EOR  : T2I_bin_w_irs<0b0100, "eor",
+                            BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+
+defm t2BIC  : T2I_bin_w_irs<0b0001, "bic",
+                            BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+let Constraints = "$src = $dst" in
+def t2BFC : T2I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
+                IIC_iUNAsi, "bfc", "\t$dst, $imm",
+                [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10110;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+}
+
+def t2SBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+                 IIC_iALUi, "sbfx", "\t$dst, $src, $lsb, $width", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10100;
+  let Inst{15} = 0;
+}
+
+def t2UBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+                 IIC_iALUi, "ubfx", "\t$dst, $src, $lsb, $width", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b11100;
+  let Inst{15} = 0;
+}
+
+// A8.6.18 BFI - Bitfield insert (Encoding T1)
+// Added for disassembler with the pattern field purposely left blank.
+// FIXME: Utilize this instruction in codegen.
+def t2BFI : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+                IIC_iALUi, "bfi", "\t$dst, $src, $lsb, $width", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10110;
+  let Inst{15} = 0;
+}
+
+defm t2ORN  : T2I_bin_irs<0b0011, "orn", BinOpFrag<(or node:$LHS,
+                          (not node:$RHS))>>;
+
+// Prefer this over t2EORri ra, rb, -1 because mvn has a 16-bit version.
+let AddedComplexity = 1 in
+defm t2MVN  : T2I_un_irs <0b0011, "mvn", UnOpFrag<(not node:$Src)>, 1, 1>;
+
+
+def : T2Pat<(and GPR:$src, t2_so_imm_not:$imm),
+            (t2BICri GPR:$src, t2_so_imm_not:$imm)>;
+
+// FIXME: Disable this pattern on Darwin to work around an assembler bug.
+def : T2Pat<(or GPR:$src, t2_so_imm_not:$imm),
+            (t2ORNri GPR:$src, t2_so_imm_not:$imm)>,
+            Requires<[IsThumb2]>;
+
+def : T2Pat<(t2_so_imm_not:$src),
+            (t2MVNi t2_so_imm_not:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Multiply Instructions.
+// +let isCommutable = 1 in +def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + "mul", "\t$dst, $a, $b", + [(set GPR:$dst, (mul GPR:$a, GPR:$b))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b000; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-4} = 0b0000; // Multiply +} + +def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, + "mla", "\t$dst, $a, $b, $c", + [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b000; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-4} = 0b0000; // Multiply +} + +def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, + "mls", "\t$dst, $a, $b, $c", + [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b000; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-4} = 0b0001; // Multiply and Subtract +} + +// Extra precision multiplies with low / high results +let neverHasSideEffects = 1 in { +let isCommutable = 1 in { +def t2SMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64, + "smull", "\t$ldst, $hdst, $a, $b", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0111; + let Inst{22-20} = 0b000; + let Inst{7-4} = 0b0000; +} + +def t2UMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64, + "umull", "\t$ldst, $hdst, $a, $b", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0111; + let Inst{22-20} = 0b010; + let Inst{7-4} = 0b0000; +} +} // isCommutable + +// Multiply + accumulate +def t2SMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, + "smlal", "\t$ldst, $hdst, $a, $b", []>{ + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0111; + let Inst{22-20} = 0b100; + let Inst{7-4} = 0b0000; +} + +def t2UMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, + "umlal", "\t$ldst, $hdst, $a, $b", []>{ + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0111; + let Inst{22-20} = 0b110; + let Inst{7-4} = 0b0000; +} + +def t2UMAAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, + "umaal", "\t$ldst, $hdst, $a, $b", []>{ + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0111; + let Inst{22-20} = 0b110; + let Inst{7-4} = 0b0110; +} +} // neverHasSideEffects + +// Rounding variants of the below included for disassembly only + +// Most significant word multiply +def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + "smmul", "\t$dst, $a, $b", + [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) +} + +def t2SMMULR : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + "smmulr", "\t$dst, $a, $b", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + +def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, + "smmla", "\t$dst, $a, $b, $c", + [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) +} + +def t2SMMLAR : T2I<(outs 
GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, + "smmlar", "\t$dst, $a, $b, $c", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + +def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, + "smmls", "\t$dst, $a, $b, $c", + [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b110; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) +} + +def t2SMMLSR : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, + "smmlsr", "\t$dst, $a, $b, $c", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b110; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + +multiclass T2I_smul<string opc, PatFrag opnode> { + def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + !strconcat(opc, "bb"), "\t$dst, $a, $b", + [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), + (sext_inreg GPR:$b, i16)))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b00; + } + + def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + !strconcat(opc, "bt"), "\t$dst, $a, $b", + [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), + (sra GPR:$b, (i32 16))))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b01; + } + + def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + !strconcat(opc, "tb"), "\t$dst, $a, $b", + [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), + (sext_inreg GPR:$b, i16)))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b10; + } + + def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + !strconcat(opc, "tt"), "\t$dst, $a, $b", + [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), + (sra GPR:$b, (i32 16))))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b11; + } + + def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16, + !strconcat(opc, "wb"), "\t$dst, $a, $b", + [(set GPR:$dst, (sra (opnode GPR:$a, + (sext_inreg GPR:$b, i16)), (i32 16)))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b011; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b00; + } + + def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16, + !strconcat(opc, "wt"), "\t$dst, $a, $b", + [(set GPR:$dst, (sra (opnode GPR:$a, + (sra GPR:$b, (i32 16))), (i32 16)))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b011; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b01; + } +} + + +multiclass T2I_smla<string opc, PatFrag opnode> { + def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + !strconcat(opc, "bb"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, + (opnode (sext_inreg 
GPR:$a, i16), + (sext_inreg GPR:$b, i16))))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b00; + } + + def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16), + (sra GPR:$b, (i32 16)))))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b01; + } + + def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + !strconcat(opc, "tb"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), + (sext_inreg GPR:$b, i16))))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b10; + } + + def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + !strconcat(opc, "tt"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), + (sra GPR:$b, (i32 16)))))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b11; + } + + def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + !strconcat(opc, "wb"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, + (sext_inreg GPR:$b, i16)), (i32 16))))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b011; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b00; + } + + def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + !strconcat(opc, "wt"), "\t$dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, + (sra GPR:$b, (i32 16))), (i32 16))))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b011; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b01; + } +} + +defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; +defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; + +// Halfword multiple accumulate long: SMLAL<x><y> -- for disassembly only +def t2SMLALBB : T2I_mac<1, 0b100, 0b1000, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlalbb", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>; +def t2SMLALBT : T2I_mac<1, 0b100, 0b1001, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlalbt", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>; +def t2SMLALTB : T2I_mac<1, 0b100, 0b1010, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlaltb", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>; +def t2SMLALTT : T2I_mac<1, 0b100, 0b1011, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlaltt", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>; + +// Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD +// These are for disassembly only. 
+ +def t2SMUAD : T2I_mac<0, 0b010, 0b0000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMAC32, "smuad", "\t$dst, $a, $b", []> { + let Inst{15-12} = 0b1111; +} +def t2SMUADX : T2I_mac<0, 0b010, 0b0001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMAC32, "smuadx", "\t$dst, $a, $b", []> { + let Inst{15-12} = 0b1111; +} +def t2SMUSD : T2I_mac<0, 0b100, 0b0000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMAC32, "smusd", "\t$dst, $a, $b", []> { + let Inst{15-12} = 0b1111; +} +def t2SMUSDX : T2I_mac<0, 0b100, 0b0001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMAC32, "smusdx", "\t$dst, $a, $b", []> { + let Inst{15-12} = 0b1111; +} +def t2SMLAD : T2I_mac<0, 0b010, 0b0000, (outs GPR:$dst), + (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smlad", + "\t$dst, $a, $b, $acc", []>; +def t2SMLADX : T2I_mac<0, 0b010, 0b0001, (outs GPR:$dst), + (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smladx", + "\t$dst, $a, $b, $acc", []>; +def t2SMLSD : T2I_mac<0, 0b100, 0b0000, (outs GPR:$dst), + (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smlsd", + "\t$dst, $a, $b, $acc", []>; +def t2SMLSDX : T2I_mac<0, 0b100, 0b0001, (outs GPR:$dst), + (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smlsdx", + "\t$dst, $a, $b, $acc", []>; +def t2SMLALD : T2I_mac<1, 0b100, 0b1100, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlald", + "\t$ldst, $hdst, $a, $b", []>; +def t2SMLALDX : T2I_mac<1, 0b100, 0b1101, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlaldx", + "\t$ldst, $hdst, $a, $b", []>; +def t2SMLSLD : T2I_mac<1, 0b101, 0b1100, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlsld", + "\t$ldst, $hdst, $a, $b", []>; +def t2SMLSLDX : T2I_mac<1, 0b101, 0b1101, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlsldx", + "\t$ldst, $hdst, $a, $b", []>; + +//===----------------------------------------------------------------------===// +// Misc. Arithmetic Instructions. 
+// + +class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-22} = 0b01010; + let Inst{21-20} = op1; + let Inst{15-12} = 0b1111; + let Inst{7-6} = 0b10; + let Inst{5-4} = op2; +} + +def t2CLZ : T2I_misc<0b11, 0b00, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + "clz", "\t$dst, $src", [(set GPR:$dst, (ctlz GPR:$src))]>; + +def t2RBIT : T2I_misc<0b01, 0b10, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + "rbit", "\t$dst, $src", + [(set GPR:$dst, (ARMrbit GPR:$src))]>; + +def t2REV : T2I_misc<0b01, 0b00, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + "rev", ".w\t$dst, $src", [(set GPR:$dst, (bswap GPR:$src))]>; + +def t2REV16 : T2I_misc<0b01, 0b01, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + "rev16", ".w\t$dst, $src", + [(set GPR:$dst, + (or (and (srl GPR:$src, (i32 8)), 0xFF), + (or (and (shl GPR:$src, (i32 8)), 0xFF00), + (or (and (srl GPR:$src, (i32 8)), 0xFF0000), + (and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>; + +def t2REVSH : T2I_misc<0b01, 0b11, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + "revsh", ".w\t$dst, $src", + [(set GPR:$dst, + (sext_inreg + (or (srl (and GPR:$src, 0xFF00), (i32 8)), + (shl GPR:$src, (i32 8))), i16))]>; + +def t2PKHBT : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), + IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2, lsl $shamt", + [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF), + (and (shl GPR:$src2, (i32 imm:$shamt)), + 0xFFFF0000)))]>, + Requires<[HasT2ExtractPack]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-20} = 0b01100; + let Inst{5} = 0; // BT form + let Inst{4} = 0; +} + +// Alternate cases for PKHBT where identities eliminate some nodes. +def : T2Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)), + (t2PKHBT GPR:$src1, GPR:$src2, 0)>, + Requires<[HasT2ExtractPack]>; +def : T2Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)), + (t2PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>, + Requires<[HasT2ExtractPack]>; + +def t2PKHTB : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), + IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2, asr $shamt", + [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000), + (and (sra GPR:$src2, imm16_31:$shamt), + 0xFFFF)))]>, + Requires<[HasT2ExtractPack]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-20} = 0b01100; + let Inst{5} = 1; // TB form + let Inst{4} = 0; +} + +// Alternate cases for PKHTB where identities eliminate some nodes. Note that +// a shift amount of 0 is *not legal* here, it is PKHBT instead. +def : T2Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))), + (t2PKHTB GPR:$src1, GPR:$src2, 16)>, + Requires<[HasT2ExtractPack]>; +def : T2Pat<(or (and GPR:$src1, 0xFFFF0000), + (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)), + (t2PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>, + Requires<[HasT2ExtractPack]>; + +//===----------------------------------------------------------------------===// +// Comparison Instructions... 
+// + +defm t2CMP : T2I_cmp_irs<0b1101, "cmp", + BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; +defm t2CMPz : T2I_cmp_irs<0b1101, "cmp", + BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>; + +//FIXME: Disable CMN, as CCodes are backwards from compare expectations +// Compare-to-zero still works out, just not the relationals +//defm t2CMN : T2I_cmp_irs<0b1000, "cmn", +// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; +defm t2CMNz : T2I_cmp_irs<0b1000, "cmn", + BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>; + +//def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm), +// (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>; + +def : T2Pat<(ARMcmpZ GPR:$src, t2_so_imm_neg:$imm), + (t2CMNzri GPR:$src, t2_so_imm_neg:$imm)>; + +defm t2TST : T2I_cmp_irs<0b0000, "tst", + BinOpFrag<(ARMcmpZ (and node:$LHS, node:$RHS), 0)>>; +defm t2TEQ : T2I_cmp_irs<0b0100, "teq", + BinOpFrag<(ARMcmpZ (xor node:$LHS, node:$RHS), 0)>>; + +// A8.6.27 CBNZ, CBZ - Compare and branch on (non)zero. +// Short range conditional branch. Looks awesome for loops. Need to figure +// out how to use this one. + + +// Conditional moves +// FIXME: should be able to write a pattern for ARMcmov, but can't use +// a two-value operand where a dag node expects two operands. :( +let neverHasSideEffects = 1 in { +def t2MOVCCr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true), IIC_iCMOVr, + "mov", ".w\t$dst, $true", + [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $dst"> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{20} = 0; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{14-12} = 0b000; + let Inst{7-4} = 0b0000; +} + +def t2MOVCCi : T2I<(outs GPR:$dst), (ins GPR:$false, t2_so_imm:$true), + IIC_iCMOVi, "mov", ".w\t$dst, $true", +[/*(set GPR:$dst, (ARMcmov GPR:$false, t2_so_imm:$true, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $dst"> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b0010; + let Inst{20} = 0; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; +} + +class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{20} = 0; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{5-4} = opcod; // Shift type. 
+} +def t2MOVCClsl : T2I_movcc_sh<0b00, (outs GPR:$dst), + (ins GPR:$false, GPR:$true, i32imm:$rhs), + IIC_iCMOVsi, "lsl", ".w\t$dst, $true, $rhs", []>, + RegConstraint<"$false = $dst">; +def t2MOVCClsr : T2I_movcc_sh<0b01, (outs GPR:$dst), + (ins GPR:$false, GPR:$true, i32imm:$rhs), + IIC_iCMOVsi, "lsr", ".w\t$dst, $true, $rhs", []>, + RegConstraint<"$false = $dst">; +def t2MOVCCasr : T2I_movcc_sh<0b10, (outs GPR:$dst), + (ins GPR:$false, GPR:$true, i32imm:$rhs), + IIC_iCMOVsi, "asr", ".w\t$dst, $true, $rhs", []>, + RegConstraint<"$false = $dst">; +def t2MOVCCror : T2I_movcc_sh<0b11, (outs GPR:$dst), + (ins GPR:$false, GPR:$true, i32imm:$rhs), + IIC_iCMOVsi, "ror", ".w\t$dst, $true, $rhs", []>, + RegConstraint<"$false = $dst">; +} // neverHasSideEffects + +//===----------------------------------------------------------------------===// +// Atomic operations intrinsics +// + +// memory barriers protect the atomic sequences +let hasSideEffects = 1 in { +def t2Int_MemBarrierV7 : AInoP<(outs), (ins), + ThumbFrm, NoItinerary, + "dmb", "", + [(ARMMemBarrierV7)]>, + Requires<[IsThumb2]> { + let Inst{31-4} = 0xF3BF8F5; + // FIXME: add support for options other than a full system DMB + let Inst{3-0} = 0b1111; +} + +def t2Int_SyncBarrierV7 : AInoP<(outs), (ins), + ThumbFrm, NoItinerary, + "dsb", "", + [(ARMSyncBarrierV7)]>, + Requires<[IsThumb2]> { + let Inst{31-4} = 0xF3BF8F4; + // FIXME: add support for options other than a full system DSB + let Inst{3-0} = 0b1111; +} +} + +// Helper class for multiclass T2MemB -- for disassembly only +class T2I_memb<string opc, string asm> + : T2I<(outs), (ins), NoItinerary, opc, asm, + [/* For disassembly only; pattern left blank */]>, + Requires<[IsThumb2, HasV7]> { + let Inst{31-20} = 0xf3b; + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +multiclass T2MemB<bits<4> op7_4, string opc> { + + def st : T2I_memb<opc, "\tst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1110; + } + + def ish : T2I_memb<opc, "\tish"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1011; + } + + def ishst : T2I_memb<opc, "\tishst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1010; + } + + def nsh : T2I_memb<opc, "\tnsh"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0111; + } + + def nshst : T2I_memb<opc, "\tnshst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0110; + } + + def osh : T2I_memb<opc, "\tosh"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0011; + } + + def oshst : T2I_memb<opc, "\toshst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0010; + } +} + +// These DMB variants are for disassembly only. +defm t2DMB : T2MemB<0b0101, "dmb">; + +// These DSB variants are for disassembly only. 
+defm t2DSB : T2MemB<0b0100, "dsb">; + +// ISB has only full system option -- for disassembly only +def t2ISBsy : T2I_memb<"isb", ""> { + let Inst{7-4} = 0b0110; + let Inst{3-0} = 0b1111; +} + +class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern, bits<4> rt2 = 0b1111> + : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001101; + let Inst{11-8} = rt2; + let Inst{7-6} = 0b01; + let Inst{5-4} = opcod; + let Inst{3-0} = 0b1111; +} +class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern, bits<4> rt2 = 0b1111> + : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001100; + let Inst{11-8} = rt2; + let Inst{7-6} = 0b01; + let Inst{5-4} = opcod; +} + +let mayLoad = 1 in { +def t2LDREXB : T2I_ldrex<0b00, (outs GPR:$dest), (ins GPR:$ptr), AddrModeNone, + Size4Bytes, NoItinerary, "ldrexb", "\t$dest, [$ptr]", + "", []>; +def t2LDREXH : T2I_ldrex<0b01, (outs GPR:$dest), (ins GPR:$ptr), AddrModeNone, + Size4Bytes, NoItinerary, "ldrexh", "\t$dest, [$ptr]", + "", []>; +def t2LDREX : Thumb2I<(outs GPR:$dest), (ins GPR:$ptr), AddrModeNone, + Size4Bytes, NoItinerary, + "ldrex", "\t$dest, [$ptr]", "", + []> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000101; + let Inst{11-8} = 0b1111; + let Inst{7-0} = 0b00000000; // imm8 = 0 +} +def t2LDREXD : T2I_ldrex<0b11, (outs GPR:$dest, GPR:$dest2), (ins GPR:$ptr), + AddrModeNone, Size4Bytes, NoItinerary, + "ldrexd", "\t$dest, $dest2, [$ptr]", "", + [], {?, ?, ?, ?}>; +} + +let mayStore = 1, Constraints = "@earlyclobber $success" in { +def t2STREXB : T2I_strex<0b00, (outs GPR:$success), (ins GPR:$src, GPR:$ptr), + AddrModeNone, Size4Bytes, NoItinerary, + "strexb", "\t$success, $src, [$ptr]", "", []>; +def t2STREXH : T2I_strex<0b01, (outs GPR:$success), (ins GPR:$src, GPR:$ptr), + AddrModeNone, Size4Bytes, NoItinerary, + "strexh", "\t$success, $src, [$ptr]", "", []>; +def t2STREX : Thumb2I<(outs GPR:$success), (ins GPR:$src, GPR:$ptr), + AddrModeNone, Size4Bytes, NoItinerary, + "strex", "\t$success, $src, [$ptr]", "", + []> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000100; + let Inst{7-0} = 0b00000000; // imm8 = 0 +} +def t2STREXD : T2I_strex<0b11, (outs GPR:$success), + (ins GPR:$src, GPR:$src2, GPR:$ptr), + AddrModeNone, Size4Bytes, NoItinerary, + "strexd", "\t$success, $src, $src2, [$ptr]", "", [], + {?, ?, ?, ?}>; +} + +// Clear-Exclusive is for disassembly only. +def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV7]> { + let Inst{31-20} = 0xf3b; + let Inst{15-14} = 0b10; + let Inst{12} = 0; + let Inst{7-4} = 0b0010; +} + +//===----------------------------------------------------------------------===// +// TLS Instructions +// + +// __aeabi_read_tp preserves the registers r1-r3. 
+let isCall = 1,
+  Defs = [R0, R12, LR, CPSR] in {
+  def t2TPsoft : T2XI<(outs), (ins), IIC_Br,
+                      "bl\t__aeabi_read_tp",
+                      [(set R0, ARMthread_pointer)]> {
+    let Inst{31-27} = 0b11110;
+    let Inst{15-14} = 0b11;
+    let Inst{12} = 1;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved registers, which is exactly what we want.
+//   The current SP is passed in $val, and we reuse the reg as a scratch.
+let Defs =
+  [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0,
+    D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
+    D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
+    D31 ] in {
+  def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val),
+                               AddrModeNone, SizeSpecial, NoItinerary,
+                               "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n"
+                               "\tmov\t$val, pc\n"
+                               "\tadds\t$val, #7\n"
+                               "\tstr\t$val, [$src, #4]\n"
+                               "\tmovs\tr0, #0\n"
+                               "\tb\t1f\n"
+                               "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n"
+                               "1:", "",
+                               [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>,
+                             Requires<[IsThumb2, HasVFP2]>;
+}
+
+let Defs =
+  [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ] in {
+  def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val),
+                               AddrModeNone, SizeSpecial, NoItinerary,
+                               "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n"
+                               "\tmov\t$val, pc\n"
+                               "\tadds\t$val, #7\n"
+                               "\tstr\t$val, [$src, #4]\n"
+                               "\tmovs\tr0, #0\n"
+                               "\tb\t1f\n"
+                               "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n"
+                               "1:", "",
+                               [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>,
+                             Requires<[IsThumb2, NoVFP]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Control-Flow Instructions
+//
+
+// FIXME: remove when we have a way to mark an MI with these properties.
+// FIXME: $dst1 should be a def. But the extra ops must be at the end of the
+// operand list.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1 in
+  def t2LDM_RET : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
+                                         reglist:$dsts, variable_ops), IIC_Br,
+                        "ldm${addr:submode}${p}${addr:wide}\t$addr, $dsts",
+                        "$addr.addr = $wb", []> {
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = {?, ?}; // IA: '01', DB: '10'
+    let Inst{22} = 0;
+    let Inst{21} = 1; // The W bit.
+ let Inst{20} = 1; // Load +} + +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { +let isPredicable = 1 in +def t2B : T2XI<(outs), (ins brtarget:$target), IIC_Br, + "b.w\t$target", + [(br bb:$target)]> { + let Inst{31-27} = 0b11110; + let Inst{15-14} = 0b10; + let Inst{12} = 1; +} + +let isNotDuplicable = 1, isIndirectBranch = 1 in { +def t2BR_JT : + T2JTI<(outs), + (ins GPR:$target, GPR:$index, jt2block_operand:$jt, i32imm:$id), + IIC_Br, "mov\tpc, $target\n$jt", + [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0100100; + let Inst{19-16} = 0b1111; + let Inst{14-12} = 0b000; + let Inst{11-8} = 0b1111; // Rd = pc + let Inst{7-4} = 0b0000; +} + +// FIXME: Add a non-pc based case that can be predicated. +def t2TBB : + T2JTI<(outs), + (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id), + IIC_Br, "tbb\t$index\n$jt", []> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001101; + let Inst{19-16} = 0b1111; // Rn = pc (table follows this instruction) + let Inst{15-8} = 0b11110000; + let Inst{7-4} = 0b0000; // B form +} + +def t2TBH : + T2JTI<(outs), + (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id), + IIC_Br, "tbh\t$index\n$jt", []> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001101; + let Inst{19-16} = 0b1111; // Rn = pc (table follows this instruction) + let Inst{15-8} = 0b11110000; + let Inst{7-4} = 0b0001; // H form +} + +// Generic versions of the above two instructions, for disassembly only + +def t2TBBgen : T2I<(outs), (ins GPR:$a, GPR:$b), IIC_Br, + "tbb", "\t[$a, $b]", []>{ + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001101; + let Inst{15-8} = 0b11110000; + let Inst{7-4} = 0b0000; // B form +} + +def t2TBHgen : T2I<(outs), (ins GPR:$a, GPR:$b), IIC_Br, + "tbh", "\t[$a, $b, lsl #1]", []> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001101; + let Inst{15-8} = 0b11110000; + let Inst{7-4} = 0b0001; // H form +} +} // isNotDuplicable, isIndirectBranch + +} // isBranch, isTerminator, isBarrier + +// FIXME: should be able to write a pattern for ARMBrcond, but can't use +// a two-value operand where a dag node expects two operands. :( +let isBranch = 1, isTerminator = 1 in +def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br, + "b", ".w\t$target", + [/*(ARMbrcond bb:$target, imm:$cc)*/]> { + let Inst{31-27} = 0b11110; + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + + +// IT block +def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask), + AddrModeNone, Size2Bytes, IIC_iALUx, + "it$mask\t$cc", "", []> { + // 16-bit instruction. + let Inst{31-16} = 0x0000; + let Inst{15-8} = 0b10111111; +} + +// Branch and Exchange Jazelle -- for disassembly only +// Rm = Inst{19-16} +def t2BXJ : T2I<(outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-20} = 0b111100; + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +// Change Processor State is a system instruction -- for disassembly only. 
+// The singleton $opt operand contains the following information: +// opt{4-0} = mode from Inst{4-0} +// opt{5} = changemode from Inst{17} +// opt{8-6} = AIF from Inst{8-6} +// opt{10-9} = imod from Inst{19-18} with 0b10 as enable and 0b11 as disable +def t2CPS : T2XI<(outs),(ins cps_opt:$opt), NoItinerary, "cps$opt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-20} = 0b111010; + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +// A6.3.4 Branches and miscellaneous control +// Table A6-14 Change Processor State, and hint instructions +// Helper class for disassembly only. +class T2I_hint<bits<8> op7_0, string opc, string asm> + : T2I<(outs), (ins), NoItinerary, opc, asm, + [/* For disassembly only; pattern left blank */]> { + let Inst{31-20} = 0xf3a; + let Inst{15-14} = 0b10; + let Inst{12} = 0; + let Inst{10-8} = 0b000; + let Inst{7-0} = op7_0; +} + +def t2NOP : T2I_hint<0b00000000, "nop", ".w">; +def t2YIELD : T2I_hint<0b00000001, "yield", ".w">; +def t2WFE : T2I_hint<0b00000010, "wfe", ".w">; +def t2WFI : T2I_hint<0b00000011, "wfi", ".w">; +def t2SEV : T2I_hint<0b00000100, "sev", ".w">; + +def t2DBG : T2I<(outs),(ins i32imm:$opt), NoItinerary, "dbg", "\t$opt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-20} = 0xf3a; + let Inst{15-14} = 0b10; + let Inst{12} = 0; + let Inst{10-8} = 0b000; + let Inst{7-4} = 0b1111; +} + +// Secure Monitor Call is a system instruction -- for disassembly only +// Option = Inst{19-16} +def t2SMC : T2I<(outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26-20} = 0b1111111; + let Inst{15-12} = 0b1000; +} + +// Store Return State is a system instruction -- for disassembly only +def t2SRSDBW : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp!, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000010; // W = 1 +} + +def t2SRSDB : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000000; // W = 0 +} + +def t2SRSIAW : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsia","\tsp!, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0011010; // W = 1 +} + +def t2SRSIA : T2I<(outs), (ins i32imm:$mode),NoItinerary,"srsia","\tsp, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0011000; // W = 0 +} + +// Return From Exception is a system instruction -- for disassembly only +def t2RFEDBW : T2I<(outs), (ins GPR:$base), NoItinerary, "rfedb", "\t$base!", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000011; // W = 1 +} + +def t2RFEDB : T2I<(outs), (ins GPR:$base), NoItinerary, "rfeab", "\t$base", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000001; // W = 0 +} + +def t2RFEIAW : T2I<(outs), (ins GPR:$base), NoItinerary, "rfeia", "\t$base!", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0011011; // W = 1 +} + +def t2RFEIA : T2I<(outs), (ins GPR:$base), NoItinerary, "rfeia", "\t$base", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0011001; // W 
= 0 +} + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// Two piece so_imms. +def : T2Pat<(or GPR:$LHS, t2_so_imm2part:$RHS), + (t2ORRri (t2ORRri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), + (t2_so_imm2part_2 imm:$RHS))>; +def : T2Pat<(xor GPR:$LHS, t2_so_imm2part:$RHS), + (t2EORri (t2EORri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), + (t2_so_imm2part_2 imm:$RHS))>; +def : T2Pat<(add GPR:$LHS, t2_so_imm2part:$RHS), + (t2ADDri (t2ADDri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), + (t2_so_imm2part_2 imm:$RHS))>; +def : T2Pat<(add GPR:$LHS, t2_so_neg_imm2part:$RHS), + (t2SUBri (t2SUBri GPR:$LHS, (t2_so_neg_imm2part_1 imm:$RHS)), + (t2_so_neg_imm2part_2 imm:$RHS))>; + +// 32-bit immediate using movw + movt. +// This is a single pseudo instruction to make it re-materializable. Remove +// when we can do generalized remat. +let isReMaterializable = 1 in +def t2MOVi32imm : T2Ix2<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi, + "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}", + [(set GPR:$dst, (i32 imm:$src))]>; + +// ConstantPool, GlobalAddress, and JumpTable +def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>, + Requires<[IsThumb2, DontUseMovt]>; +def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>; +def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>, + Requires<[IsThumb2, UseMovt]>; + +def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (t2LEApcrelJT tjumptable:$dst, imm:$id)>; + +// Pseudo instruction that combines ldr from constpool and add pc. This should +// be expanded into two instructions late to allow if-conversion and +// scheduling. +let canFoldAsLoad = 1, isReMaterializable = 1 in +def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), + NoItinerary, "${:comment} ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", + [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), + imm:$cp))]>, + Requires<[IsThumb2]>; + +//===----------------------------------------------------------------------===// +// Move between special register and ARM core register -- for disassembly only +// + +// Rd = Instr{11-8} +def t2MRS : T2I<(outs GPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, cpsr", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-21} = 0b11111; + let Inst{20} = 0; // The R bit. + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +// Rd = Instr{11-8} +def t2MRSsys : T2I<(outs GPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, spsr", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-21} = 0b11111; + let Inst{20} = 1; // The R bit. + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +// Rn = Inst{19-16} +def t2MSR : T2I<(outs), (ins GPR:$src, msr_mask:$mask), NoItinerary, "msr", + "\tcpsr$mask, $src", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-21} = 0b11100; + let Inst{20} = 0; // The R bit. + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +// Rn = Inst{19-16} +def t2MSRsys : T2I<(outs), (ins GPR:$src, msr_mask:$mask), NoItinerary, "msr", + "\tspsr$mask, $src", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-21} = 0b11100; + let Inst{20} = 1; // The R bit. 
+ let Inst{15-14} = 0b10; + let Inst{12} = 0; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td new file mode 100644 index 0000000..54474cf --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -0,0 +1,707 @@ +//===- ARMInstrVFP.td - VFP support for ARM -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM VFP instruction set. +// +//===----------------------------------------------------------------------===// + +def SDT_FTOI : +SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>; +def SDT_ITOF : +SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>; +def SDT_CMPFP0 : +SDTypeProfile<0, 1, [SDTCisFP<0>]>; +def SDT_VMOVDRR : +SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>]>; + +def arm_ftoui : SDNode<"ARMISD::FTOUI", SDT_FTOI>; +def arm_ftosi : SDNode<"ARMISD::FTOSI", SDT_FTOI>; +def arm_sitof : SDNode<"ARMISD::SITOF", SDT_ITOF>; +def arm_uitof : SDNode<"ARMISD::UITOF", SDT_ITOF>; +def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInFlag,SDNPOutFlag]>; +def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutFlag]>; +def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0",SDT_CMPFP0, [SDNPOutFlag]>; +def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; + +//===----------------------------------------------------------------------===// +// Operand Definitions. +// + + +def vfp_f32imm : Operand<f32>, + PatLeaf<(f32 fpimm), [{ + return ARM::getVFPf32Imm(N->getValueAPF()) != -1; + }]> { + let PrintMethod = "printVFPf32ImmOperand"; +} + +def vfp_f64imm : Operand<f64>, + PatLeaf<(f64 fpimm), [{ + return ARM::getVFPf64Imm(N->getValueAPF()) != -1; + }]> { + let PrintMethod = "printVFPf64ImmOperand"; +} + + +//===----------------------------------------------------------------------===// +// Load / store Instructions. +// + +let canFoldAsLoad = 1, isReMaterializable = 1 in { +def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$dst), (ins addrmode5:$addr), + IIC_fpLoad64, "vldr", ".64\t$dst, $addr", + [(set DPR:$dst, (f64 (load addrmode5:$addr)))]>; + +def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$dst), (ins addrmode5:$addr), + IIC_fpLoad32, "vldr", ".32\t$dst, $addr", + [(set SPR:$dst, (load addrmode5:$addr))]>; +} // canFoldAsLoad + +def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$src, addrmode5:$addr), + IIC_fpStore64, "vstr", ".64\t$src, $addr", + [(store (f64 DPR:$src), addrmode5:$addr)]>; + +def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr), + IIC_fpStore32, "vstr", ".32\t$src, $addr", + [(store SPR:$src, addrmode5:$addr)]>; + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. 
+// + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +def VLDMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dsts, + variable_ops), IndexModeNone, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}, $dsts", "", []> { + let Inst{20} = 1; +} + +def VLDMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dsts, + variable_ops), IndexModeNone, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}, $dsts", "", []> { + let Inst{20} = 1; +} + +def VLDMD_UPD : AXDI5<(outs GPR:$wb), (ins addrmode5:$addr, pred:$p, + reglist:$dsts, variable_ops), + IndexModeUpd, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}!, $dsts", + "$addr.base = $wb", []> { + let Inst{20} = 1; +} + +def VLDMS_UPD : AXSI5<(outs GPR:$wb), (ins addrmode5:$addr, pred:$p, + reglist:$dsts, variable_ops), + IndexModeUpd, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}!, $dsts", + "$addr.base = $wb", []> { + let Inst{20} = 1; +} +} // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq + +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { +def VSTMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$srcs, + variable_ops), IndexModeNone, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}, $srcs", "", []> { + let Inst{20} = 0; +} + +def VSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$srcs, + variable_ops), IndexModeNone, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}, $srcs", "", []> { + let Inst{20} = 0; +} + +def VSTMD_UPD : AXDI5<(outs GPR:$wb), (ins addrmode5:$addr, pred:$p, + reglist:$srcs, variable_ops), + IndexModeUpd, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}!, $srcs", + "$addr.base = $wb", []> { + let Inst{20} = 0; +} + +def VSTMS_UPD : AXSI5<(outs GPR:$wb), (ins addrmode5:$addr, pred:$p, + reglist:$srcs, variable_ops), + IndexModeUpd, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}!, $srcs", + "$addr.base = $wb", []> { + let Inst{20} = 0; +} +} // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq + +// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores + +//===----------------------------------------------------------------------===// +// FP Binary Operations. +// + +def VADDD : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b), + IIC_fpALU64, "vadd", ".f64\t$dst, $a, $b", + [(set DPR:$dst, (fadd DPR:$a, (f64 DPR:$b)))]>; + +def VADDS : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b), + IIC_fpALU32, "vadd", ".f32\t$dst, $a, $b", + [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>; + +// These are encoded as unary instructions. 
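+// Note that the compares below only set the FPSCR flags (hence the
+// Defs = [FPSCR]); a separate FMSTAT ("vmrs apsr_nzcv, fpscr", defined in the
+// Misc. section at the end of this file) is needed afterwards to copy those
+// flags into CPSR before a conditional branch or select can use them.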
+let Defs = [FPSCR] in { +def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins DPR:$a, DPR:$b), + IIC_fpCMP64, "vcmpe", ".f64\t$a, $b", + [(arm_cmpfp DPR:$a, (f64 DPR:$b))]>; + +def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins DPR:$a, DPR:$b), + IIC_fpCMP64, "vcmp", ".f64\t$a, $b", + [/* For disassembly only; pattern left blank */]>; + +def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins SPR:$a, SPR:$b), + IIC_fpCMP32, "vcmpe", ".f32\t$a, $b", + [(arm_cmpfp SPR:$a, SPR:$b)]>; + +def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins SPR:$a, SPR:$b), + IIC_fpCMP32, "vcmp", ".f32\t$a, $b", + [/* For disassembly only; pattern left blank */]>; +} + +def VDIVD : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b), + IIC_fpDIV64, "vdiv", ".f64\t$dst, $a, $b", + [(set DPR:$dst, (fdiv DPR:$a, (f64 DPR:$b)))]>; + +def VDIVS : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b), + IIC_fpDIV32, "vdiv", ".f32\t$dst, $a, $b", + [(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>; + +def VMULD : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b), + IIC_fpMUL64, "vmul", ".f64\t$dst, $a, $b", + [(set DPR:$dst, (fmul DPR:$a, (f64 DPR:$b)))]>; + +def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b), + IIC_fpMUL32, "vmul", ".f32\t$dst, $a, $b", + [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>; + +def VNMULD : ADbI<0b11100, 0b10, 1, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b), + IIC_fpMUL64, "vnmul", ".f64\t$dst, $a, $b", + [(set DPR:$dst, (fneg (fmul DPR:$a, (f64 DPR:$b))))]>; + +def VNMULS : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b), + IIC_fpMUL32, "vnmul", ".f32\t$dst, $a, $b", + [(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]>; + +// Match reassociated forms only if not sign dependent rounding. +def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), + (VNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>; +def : Pat<(fmul (fneg SPR:$a), SPR:$b), + (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>; + + +def VSUBD : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b), + IIC_fpALU64, "vsub", ".f64\t$dst, $a, $b", + [(set DPR:$dst, (fsub DPR:$a, (f64 DPR:$b)))]>; + +def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b), + IIC_fpALU32, "vsub", ".f32\t$dst, $a, $b", + [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]>; + +//===----------------------------------------------------------------------===// +// FP Unary Operations. 
+// + +def VABSD : ADuI<0b11101, 0b11, 0b0000, 0b11, 0, (outs DPR:$dst), (ins DPR:$a), + IIC_fpUNA64, "vabs", ".f64\t$dst, $a", + [(set DPR:$dst, (fabs (f64 DPR:$a)))]>; + +def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,(outs SPR:$dst), (ins SPR:$a), + IIC_fpUNA32, "vabs", ".f32\t$dst, $a", + [(set SPR:$dst, (fabs SPR:$a))]>; + +let Defs = [FPSCR] in { +def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins DPR:$a), + IIC_fpCMP64, "vcmpe", ".f64\t$a, #0", + [(arm_cmpfp0 (f64 DPR:$a))]>; + +def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins DPR:$a), + IIC_fpCMP64, "vcmp", ".f64\t$a, #0", + [/* For disassembly only; pattern left blank */]>; + +def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins SPR:$a), + IIC_fpCMP32, "vcmpe", ".f32\t$a, #0", + [(arm_cmpfp0 SPR:$a)]>; + +def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins SPR:$a), + IIC_fpCMP32, "vcmp", ".f32\t$a, #0", + [/* For disassembly only; pattern left blank */]>; +} + +def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$dst), (ins SPR:$a), + IIC_fpCVTDS, "vcvt", ".f64.f32\t$dst, $a", + [(set DPR:$dst, (fextend SPR:$a))]>; + +// Special case encoding: bits 11-8 is 0b1011. +def VCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm, + IIC_fpCVTSD, "vcvt", ".f32.f64\t$dst, $a", + [(set SPR:$dst, (fround DPR:$a))]> { + let Inst{27-23} = 0b11101; + let Inst{21-16} = 0b110111; + let Inst{11-8} = 0b1011; + let Inst{7-6} = 0b11; + let Inst{4} = 0; +} + +// Between half-precision and single-precision. For disassembly only. + +def VCVTBSH : ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), + /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$dst, $a", + [/* For disassembly only; pattern left blank */]>; + +def : ARMPat<(f32_to_f16 SPR:$a), + (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; + +def VCVTBHS : ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), + /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$dst, $a", + [/* For disassembly only; pattern left blank */]>; + +def : ARMPat<(f16_to_f32 GPR:$a), + (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; + +def VCVTTSH : ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), + /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$dst, $a", + [/* For disassembly only; pattern left blank */]>; + +def VCVTTHS : ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), + /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$dst, $a", + [/* For disassembly only; pattern left blank */]>; + +let neverHasSideEffects = 1 in { +def VMOVD: ADuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs DPR:$dst), (ins DPR:$a), + IIC_fpUNA64, "vmov", ".f64\t$dst, $a", []>; + +def VMOVS: ASuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), + IIC_fpUNA32, "vmov", ".f32\t$dst, $a", []>; +} // neverHasSideEffects + +def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, (outs DPR:$dst), (ins DPR:$a), + IIC_fpUNA64, "vneg", ".f64\t$dst, $a", + [(set DPR:$dst, (fneg (f64 DPR:$a)))]>; + +def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,(outs SPR:$dst), (ins SPR:$a), + IIC_fpUNA32, "vneg", ".f32\t$dst, $a", + [(set SPR:$dst, (fneg SPR:$a))]>; + +def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$dst), (ins DPR:$a), + IIC_fpSQRT64, "vsqrt", ".f64\t$dst, $a", + [(set DPR:$dst, (fsqrt (f64 DPR:$a)))]>; + +def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), + IIC_fpSQRT32, "vsqrt", ".f32\t$dst, $a", + [(set SPR:$dst, (fsqrt SPR:$a))]>; + 
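+// Note: the f32_to_f16 / f16_to_f32 patterns above route the value through
+// COPY_TO_REGCLASS because the half-precision value is carried in a GPR as an
+// i32, while the VCVTB instructions themselves only operate on SPR registers.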
+//===----------------------------------------------------------------------===// +// FP <-> GPR Copies. Int <-> FP Conversions. +// + +def VMOVRS : AVConv2I<0b11100001, 0b1010, (outs GPR:$dst), (ins SPR:$src), + IIC_fpMOVSI, "vmov", "\t$dst, $src", + [(set GPR:$dst, (bitconvert SPR:$src))]>; + +def VMOVSR : AVConv4I<0b11100000, 0b1010, (outs SPR:$dst), (ins GPR:$src), + IIC_fpMOVIS, "vmov", "\t$dst, $src", + [(set SPR:$dst, (bitconvert GPR:$src))]>; + +let neverHasSideEffects = 1 in { +def VMOVRRD : AVConv3I<0b11000101, 0b1011, + (outs GPR:$wb, GPR:$dst2), (ins DPR:$src), + IIC_fpMOVDI, "vmov", "\t$wb, $dst2, $src", + [/* FIXME: Can't write pattern for multiple result instr*/]> { + let Inst{7-6} = 0b00; +} + +def VMOVRRS : AVConv3I<0b11000101, 0b1010, + (outs GPR:$wb, GPR:$dst2), (ins SPR:$src1, SPR:$src2), + IIC_fpMOVDI, "vmov", "\t$wb, $dst2, $src1, $src2", + [/* For disassembly only; pattern left blank */]> { + let Inst{7-6} = 0b00; +} +} // neverHasSideEffects + +// FMDHR: GPR -> SPR +// FMDLR: GPR -> SPR + +def VMOVDRR : AVConv5I<0b11000100, 0b1011, + (outs DPR:$dst), (ins GPR:$src1, GPR:$src2), + IIC_fpMOVID, "vmov", "\t$dst, $src1, $src2", + [(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]> { + let Inst{7-6} = 0b00; +} + +let neverHasSideEffects = 1 in +def VMOVSRR : AVConv5I<0b11000100, 0b1010, + (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2), + IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2", + [/* For disassembly only; pattern left blank */]> { + let Inst{7-6} = 0b00; +} + +// FMRDH: SPR -> GPR +// FMRDL: SPR -> GPR +// FMRRS: SPR -> GPR +// FMRX : SPR system reg -> GPR + +// FMSRR: GPR -> SPR + +// FMXR: GPR -> VFP Sstem reg + + +// Int to FP: + +def VSITOD : AVConv1I<0b11101, 0b11, 0b1000, 0b1011, + (outs DPR:$dst), (ins SPR:$a), + IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a", + [(set DPR:$dst, (f64 (arm_sitof SPR:$a)))]> { + let Inst{7} = 1; // s32 +} + +def VSITOS : AVConv1In<0b11101, 0b11, 0b1000, 0b1010, + (outs SPR:$dst),(ins SPR:$a), + IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a", + [(set SPR:$dst, (arm_sitof SPR:$a))]> { + let Inst{7} = 1; // s32 +} + +def VUITOD : AVConv1I<0b11101, 0b11, 0b1000, 0b1011, + (outs DPR:$dst), (ins SPR:$a), + IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a", + [(set DPR:$dst, (f64 (arm_uitof SPR:$a)))]> { + let Inst{7} = 0; // u32 +} + +def VUITOS : AVConv1In<0b11101, 0b11, 0b1000, 0b1010, + (outs SPR:$dst), (ins SPR:$a), + IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a", + [(set SPR:$dst, (arm_uitof SPR:$a))]> { + let Inst{7} = 0; // u32 +} + +// FP to Int: +// Always set Z bit in the instruction, i.e. "round towards zero" variants. + +def VTOSIZD : AVConv1I<0b11101, 0b11, 0b1101, 0b1011, + (outs SPR:$dst), (ins DPR:$a), + IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a", + [(set SPR:$dst, (arm_ftosi (f64 DPR:$a)))]> { + let Inst{7} = 1; // Z bit +} + +def VTOSIZS : AVConv1In<0b11101, 0b11, 0b1101, 0b1010, + (outs SPR:$dst), (ins SPR:$a), + IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a", + [(set SPR:$dst, (arm_ftosi SPR:$a))]> { + let Inst{7} = 1; // Z bit +} + +def VTOUIZD : AVConv1I<0b11101, 0b11, 0b1100, 0b1011, + (outs SPR:$dst), (ins DPR:$a), + IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a", + [(set SPR:$dst, (arm_ftoui (f64 DPR:$a)))]> { + let Inst{7} = 1; // Z bit +} + +def VTOUIZS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010, + (outs SPR:$dst), (ins SPR:$a), + IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a", + [(set SPR:$dst, (arm_ftoui SPR:$a))]> { + let Inst{7} = 1; // Z bit +} + +// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. 
+// For disassembly only. + +def VTOSIRD : AVConv1I<0b11101, 0b11, 0b1101, 0b1011, + (outs SPR:$dst), (ins DPR:$a), + IIC_fpCVTDI, "vcvtr", ".s32.f64\t$dst, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{7} = 0; // Z bit +} + +def VTOSIRS : AVConv1In<0b11101, 0b11, 0b1101, 0b1010, + (outs SPR:$dst), (ins SPR:$a), + IIC_fpCVTSI, "vcvtr", ".s32.f32\t$dst, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{7} = 0; // Z bit +} + +def VTOUIRD : AVConv1I<0b11101, 0b11, 0b1100, 0b1011, + (outs SPR:$dst), (ins DPR:$a), + IIC_fpCVTDI, "vcvtr", ".u32.f64\t$dst, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{7} = 0; // Z bit +} + +def VTOUIRS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010, + (outs SPR:$dst), (ins SPR:$a), + IIC_fpCVTSI, "vcvtr", ".u32.f32\t$dst, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{7} = 0; // Z bit +} + +// Convert between floating-point and fixed-point +// Data type for fixed-point naming convention: +// S16 (U=0, sx=0) -> SH +// U16 (U=1, sx=0) -> UH +// S32 (U=0, sx=1) -> SL +// U32 (U=1, sx=1) -> UL + +let Constraints = "$a = $dst" in { + +// FP to Fixed-Point: + +def VTOSHS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VTOUHS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VTOSLS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VTOULS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VTOSHD : AVConv1XI<0b11101, 0b11, 0b1110, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VTOUHD : AVConv1XI<0b11101, 0b11, 0b1111, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VTOSLD : AVConv1XI<0b11101, 0b11, 0b1110, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VTOULD : AVConv1XI<0b11101, 0b11, 0b1111, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +// Fixed-Point to FP: + +def VSHTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VUHTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VSLTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank 
*/]>; + +def VULTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VSHTOD : AVConv1XI<0b11101, 0b11, 0b1010, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VUHTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VSLTOD : AVConv1XI<0b11101, 0b11, 0b1010, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +} // End of 'let Constraints = "$src = $dst" in' + +//===----------------------------------------------------------------------===// +// FP FMA Operations. +// + +def VMLAD : ADbI_vmlX<0b11100, 0b00, 0, 0, + (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), + IIC_fpMAC64, "vmla", ".f64\t$dst, $a, $b", + [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), + (f64 DPR:$dstin)))]>, + RegConstraint<"$dstin = $dst">; + +def VMLAS : ASbIn<0b11100, 0b00, 0, 0, + (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), + IIC_fpMAC32, "vmla", ".f32\t$dst, $a, $b", + [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +def VNMLSD : ADbI_vmlX<0b11100, 0b01, 0, 0, + (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), + IIC_fpMAC64, "vnmls", ".f64\t$dst, $a, $b", + [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), + (f64 DPR:$dstin)))]>, + RegConstraint<"$dstin = $dst">; + +def VNMLSS : ASbI<0b11100, 0b01, 0, 0, + (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), + IIC_fpMAC32, "vnmls", ".f32\t$dst, $a, $b", + [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +def VMLSD : ADbI_vmlX<0b11100, 0b00, 1, 0, + (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), + IIC_fpMAC64, "vmls", ".f64\t$dst, $a, $b", + [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), + (f64 DPR:$dstin)))]>, + RegConstraint<"$dstin = $dst">; + +def VMLSS : ASbIn<0b11100, 0b00, 1, 0, + (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), + IIC_fpMAC32, "vmls", ".f32\t$dst, $a, $b", + [(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))), + (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, Requires<[DontUseNEONForFP]>; +def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)), + (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>; + +def VNMLAD : ADbI_vmlX<0b11100, 0b01, 1, 0, + (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), + IIC_fpMAC64, "vnmla", ".f64\t$dst, $a, $b", + [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), + (f64 DPR:$dstin)))]>, + RegConstraint<"$dstin = $dst">; + +def VNMLAS : ASbI<0b11100, 0b01, 1, 0, + (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), + IIC_fpMAC32, "vnmla", ".f32\t$dst, $a, $b", + [(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +//===----------------------------------------------------------------------===// 
+// FP Conditional moves. +// + +let neverHasSideEffects = 1 in { +def VMOVDcc : ADuI<0b11101, 0b11, 0b0000, 0b01, 0, + (outs DPR:$dst), (ins DPR:$false, DPR:$true), + IIC_fpUNA64, "vmov", ".f64\t$dst, $true", + [/*(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))*/]>, + RegConstraint<"$false = $dst">; + +def VMOVScc : ASuI<0b11101, 0b11, 0b0000, 0b01, 0, + (outs SPR:$dst), (ins SPR:$false, SPR:$true), + IIC_fpUNA32, "vmov", ".f32\t$dst, $true", + [/*(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))*/]>, + RegConstraint<"$false = $dst">; + +def VNEGDcc : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, + (outs DPR:$dst), (ins DPR:$false, DPR:$true), + IIC_fpUNA64, "vneg", ".f64\t$dst, $true", + [/*(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))*/]>, + RegConstraint<"$false = $dst">; + +def VNEGScc : ASuI<0b11101, 0b11, 0b0001, 0b01, 0, + (outs SPR:$dst), (ins SPR:$false, SPR:$true), + IIC_fpUNA32, "vneg", ".f32\t$dst, $true", + [/*(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))*/]>, + RegConstraint<"$false = $dst">; +} // neverHasSideEffects + +//===----------------------------------------------------------------------===// +// Misc. +// + +// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags +// to APSR. +let Defs = [CPSR], Uses = [FPSCR] in +def FMSTAT : VFPAI<(outs), (ins), VFPMiscFrm, IIC_fpSTAT, "vmrs", + "\tapsr_nzcv, fpscr", + [(arm_fmstat)]> { + let Inst{27-20} = 0b11101111; + let Inst{19-16} = 0b0001; + let Inst{15-12} = 0b1111; + let Inst{11-8} = 0b1010; + let Inst{7} = 0; + let Inst{4} = 1; +} + +// FPSCR <-> GPR (for disassembly only) + +let neverHasSideEffects = 1 in { +let Uses = [FPSCR] in { +def VMRS : VFPAI<(outs GPR:$dst), (ins), VFPMiscFrm, IIC_fpSTAT, "vmrs", + "\t$dst, fpscr", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-20} = 0b11101111; + let Inst{19-16} = 0b0001; + let Inst{11-8} = 0b1010; + let Inst{7} = 0; + let Inst{4} = 1; +} +} + +let Defs = [FPSCR] in { +def VMSR : VFPAI<(outs), (ins GPR:$src), VFPMiscFrm, IIC_fpSTAT, "vmsr", + "\tfpscr, $src", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-20} = 0b11101110; + let Inst{19-16} = 0b0001; + let Inst{11-8} = 0b1010; + let Inst{7} = 0; + let Inst{4} = 1; +} +} +} // neverHasSideEffects + +// Materialize FP immediates. VFP3 only. +let isReMaterializable = 1 in { +def FCONSTD : VFPAI<(outs DPR:$dst), (ins vfp_f64imm:$imm), + VFPMiscFrm, IIC_fpUNA64, + "vmov", ".f64\t$dst, $imm", + [(set DPR:$dst, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> { + let Inst{27-23} = 0b11101; + let Inst{21-20} = 0b11; + let Inst{11-9} = 0b101; + let Inst{8} = 1; + let Inst{7-4} = 0b0000; +} + +def FCONSTS : VFPAI<(outs SPR:$dst), (ins vfp_f32imm:$imm), + VFPMiscFrm, IIC_fpUNA32, + "vmov", ".f32\t$dst, $imm", + [(set SPR:$dst, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> { + let Inst{27-23} = 0b11101; + let Inst{21-20} = 0b11; + let Inst{11-9} = 0b101; + let Inst{8} = 0; + let Inst{7-4} = 0b0000; +} +} diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp new file mode 100644 index 0000000..5f6d7ee --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp @@ -0,0 +1,335 @@ +//===-- ARMJITInfo.cpp - Implement the JIT interfaces for the ARM target --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements the JIT interfaces for the ARM target. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jit" +#include "ARMJITInfo.h" +#include "ARMInstrInfo.h" +#include "ARMConstantPoolValue.h" +#include "ARMRelocations.h" +#include "ARMSubtarget.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/System/Memory.h" +#include <cstdlib> +using namespace llvm; + +void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { + report_fatal_error("ARMJITInfo::replaceMachineCodeForFunction"); +} + +/// JITCompilerFunction - This contains the address of the JIT function used to +/// compile a function lazily. +static TargetJITInfo::JITCompilerFn JITCompilerFunction; + +// Get the ASMPREFIX for the current host. This is often '_'. +#ifndef __USER_LABEL_PREFIX__ +#define __USER_LABEL_PREFIX__ +#endif +#define GETASMPREFIX2(X) #X +#define GETASMPREFIX(X) GETASMPREFIX2(X) +#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__) + +// CompilationCallback stub - We can't use a C function with inline assembly in +// it, because we the prolog/epilog inserted by GCC won't work for us (we need +// to preserve more context and manipulate the stack directly). Instead, +// write our own wrapper, which does things our way, so we have complete +// control over register saving and restoring. +extern "C" { +#if defined(__arm__) + void ARMCompilationCallback(); + asm( + ".text\n" + ".align 2\n" + ".globl " ASMPREFIX "ARMCompilationCallback\n" + ASMPREFIX "ARMCompilationCallback:\n" + // Save caller saved registers since they may contain stuff + // for the real target function right now. We have to act as if this + // whole compilation callback doesn't exist as far as the caller is + // concerned, so we can't just preserve the callee saved regs. + "stmdb sp!, {r0, r1, r2, r3, lr}\n" +#if (defined(__VFP_FP__) && !defined(__SOFTFP__)) + "fstmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n" +#endif + // The LR contains the address of the stub function on entry. + // pass it as the argument to the C part of the callback + "mov r0, lr\n" + "sub sp, sp, #4\n" + // Call the C portion of the callback + "bl " ASMPREFIX "ARMCompilationCallbackC\n" + "add sp, sp, #4\n" + // Restoring the LR to the return address of the function that invoked + // the stub and de-allocating the stack space for it requires us to + // swap the two saved LR values on the stack, as they're backwards + // for what we need since the pop instruction has a pre-determined + // order for the registers. + // +--------+ + // 0 | LR | Original return address + // +--------+ + // 1 | LR | Stub address (start of stub) + // 2-5 | R3..R0 | Saved registers (we need to preserve all regs) + // 6-20 | D0..D7 | Saved VFP registers + // +--------+ + // +#if (defined(__VFP_FP__) && !defined(__SOFTFP__)) + // Restore VFP caller-saved registers. + "fldmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n" +#endif + // + // We need to exchange the values in slots 0 and 1 so we can + // return to the address in slot 1 with the address in slot 0 + // restored to the LR. + "ldr r0, [sp,#20]\n" + "ldr r1, [sp,#16]\n" + "str r1, [sp,#20]\n" + "str r0, [sp,#16]\n" + // Return to the (newly modified) stub to invoke the real function. 
+ // The above twiddling of the saved return addresses allows us to + // deallocate everything, including the LR the stub saved, all in one + // pop instruction. + "ldmia sp!, {r0, r1, r2, r3, lr, pc}\n" + ); +#else // Not an ARM host + void ARMCompilationCallback() { + llvm_unreachable("Cannot call ARMCompilationCallback() on a non-ARM arch!"); + } +#endif +} + +/// ARMCompilationCallbackC - This is the target-specific function invoked +/// by the function stub when we did not know the real target of a call. +/// This function must locate the start of the stub or call site and pass +/// it into the JIT compiler function. +extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) { + // Get the address of the compiled code for this function. + intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)StubAddr); + + // Rewrite the call target... so that we don't end up here every time we + // execute the call. We're replacing the first two instructions of the + // stub with: + // ldr pc, [pc,#-4] + // <addr> + if (!sys::Memory::setRangeWritable((void*)StubAddr, 8)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } + *(intptr_t *)StubAddr = 0xe51ff004; // ldr pc, [pc, #-4] + *(intptr_t *)(StubAddr+4) = NewVal; + if (!sys::Memory::setRangeExecutable((void*)StubAddr, 8)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } +} + +TargetJITInfo::LazyResolverFn +ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) { + JITCompilerFunction = F; + return ARMCompilationCallback; +} + +void *ARMJITInfo::emitGlobalValueIndirectSym(const GlobalValue *GV, void *Ptr, + JITCodeEmitter &JCE) { + uint8_t Buffer[4]; + uint8_t *Cur = Buffer; + MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)Ptr); + void *PtrAddr = JCE.allocIndirectGV( + GV, Buffer, sizeof(Buffer), /*Alignment=*/4); + addIndirectSymAddr(Ptr, (intptr_t)PtrAddr); + return PtrAddr; +} + +TargetJITInfo::StubLayout ARMJITInfo::getStubLayout() { + // The stub contains up to 3 4-byte instructions, aligned at 4 bytes, and a + // 4-byte address. See emitFunctionStub for details. + StubLayout Result = {16, 4}; + return Result; +} + +void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn, + JITCodeEmitter &JCE) { + void *Addr; + // If this is just a call to an external function, emit a branch instead of a + // call. The code is the same except for one bit of the last instruction. + if (Fn != (void*)(intptr_t)ARMCompilationCallback) { + // Branch to the corresponding function addr. + if (IsPIC) { + // The stub is 16-byte size and 4-aligned. + intptr_t LazyPtr = getIndirectSymAddr(Fn); + if (!LazyPtr) { + // In PIC mode, the function stub is loading a lazy-ptr. 
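+ // The lazy pointer is a data word (created just below by
+ // emitGlobalValueIndirectSym) holding the address of the real target; the
+ // stub computes that word's address pc-relatively and loads pc through it.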
+ LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE); + DEBUG(if (F) + errs() << "JIT: Indirect symbol emitted at [" << LazyPtr + << "] for GV '" << F->getName() << "'\n"; + else + errs() << "JIT: Stub emitted at [" << LazyPtr + << "] for external function at '" << Fn << "'\n"); + } + JCE.emitAlignment(4); + Addr = (void*)JCE.getCurrentPCValue(); + if (!sys::Memory::setRangeWritable(Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } + JCE.emitWordLE(0xe59fc004); // ldr ip, [pc, #+4] + JCE.emitWordLE(0xe08fc00c); // L_func$scv: add ip, pc, ip + JCE.emitWordLE(0xe59cf000); // ldr pc, [ip] + JCE.emitWordLE(LazyPtr - (intptr_t(Addr)+4+8)); // func - (L_func$scv+8) + sys::Memory::InvalidateInstructionCache(Addr, 16); + if (!sys::Memory::setRangeExecutable(Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } + } else { + // The stub is 8-byte size and 4-aligned. + JCE.emitAlignment(4); + Addr = (void*)JCE.getCurrentPCValue(); + if (!sys::Memory::setRangeWritable(Addr, 8)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } + JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4] + JCE.emitWordLE((intptr_t)Fn); // addr of function + sys::Memory::InvalidateInstructionCache(Addr, 8); + if (!sys::Memory::setRangeExecutable(Addr, 8)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } + } + } else { + // The compilation callback will overwrite the first two words of this + // stub with indirect branch instructions targeting the compiled code. + // This stub sets the return address to restart the stub, so that + // the new branch will be invoked when we come back. + // + // Branch and link to the compilation callback. + // The stub is 16-byte size and 4-byte aligned. + JCE.emitAlignment(4); + Addr = (void*)JCE.getCurrentPCValue(); + if (!sys::Memory::setRangeWritable(Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } + // Save LR so the callback can determine which stub called it. + // The compilation callback is responsible for popping this prior + // to returning. + JCE.emitWordLE(0xe92d4000); // push {lr} + // Set the return address to go back to the start of this stub. + JCE.emitWordLE(0xe24fe00c); // sub lr, pc, #12 + // Invoke the compilation callback. + JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4] + // The address of the compilation callback. + JCE.emitWordLE((intptr_t)ARMCompilationCallback); + sys::Memory::InvalidateInstructionCache(Addr, 16); + if (!sys::Memory::setRangeExecutable(Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } + } + + return Addr; +} + +intptr_t ARMJITInfo::resolveRelocDestAddr(MachineRelocation *MR) const { + ARM::RelocationType RT = (ARM::RelocationType)MR->getRelocationType(); + switch (RT) { + default: + return (intptr_t)(MR->getResultPointer()); + case ARM::reloc_arm_pic_jt: + // Destination address - jump table base. + return (intptr_t)(MR->getResultPointer()) - MR->getConstantVal(); + case ARM::reloc_arm_jt_base: + // Jump table base address. + return getJumpTableBaseAddr(MR->getJumpTableIndex()); + case ARM::reloc_arm_cp_entry: + case ARM::reloc_arm_vfp_cp_entry: + // Constant pool entry address. 
+ return getConstantPoolEntryAddr(MR->getConstantPoolIndex()); + case ARM::reloc_arm_machine_cp_entry: { + ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)MR->getConstantVal(); + assert((!ACPV->hasModifier() && !ACPV->mustAddCurrentAddress()) && + "Can't handle this machine constant pool entry yet!"); + intptr_t Addr = (intptr_t)(MR->getResultPointer()); + Addr -= getPCLabelAddr(ACPV->getLabelId()) + ACPV->getPCAdjustment(); + return Addr; + } + } +} + +/// relocate - Before the JIT can run a block of code that has been emitted, +/// it must rewrite the code to contain the actual addresses of any +/// referenced global symbols. +void ARMJITInfo::relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase) { + for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { + void *RelocPos = (char*)Function + MR->getMachineCodeOffset(); + intptr_t ResultPtr = resolveRelocDestAddr(MR); + switch ((ARM::RelocationType)MR->getRelocationType()) { + case ARM::reloc_arm_cp_entry: + case ARM::reloc_arm_vfp_cp_entry: + case ARM::reloc_arm_relative: { + // It is necessary to calculate the correct PC relative value. We + // subtract the base addr from the target addr to form a byte offset. + ResultPtr = ResultPtr - (intptr_t)RelocPos - 8; + // If the result is positive, set bit U(23) to 1. + if (ResultPtr >= 0) + *((intptr_t*)RelocPos) |= 1 << ARMII::U_BitShift; + else { + // Otherwise, obtain the absolute value and set bit U(23) to 0. + *((intptr_t*)RelocPos) &= ~(1 << ARMII::U_BitShift); + ResultPtr = - ResultPtr; + } + // Set the immed value calculated. + // VFP immediate offset is multiplied by 4. + if (MR->getRelocationType() == ARM::reloc_arm_vfp_cp_entry) + ResultPtr = ResultPtr >> 2; + *((intptr_t*)RelocPos) |= ResultPtr; + // Set register Rn to PC. + *((intptr_t*)RelocPos) |= + ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift; + break; + } + case ARM::reloc_arm_pic_jt: + case ARM::reloc_arm_machine_cp_entry: + case ARM::reloc_arm_absolute: { + // These addresses have already been resolved. + *((intptr_t*)RelocPos) |= (intptr_t)ResultPtr; + break; + } + case ARM::reloc_arm_branch: { + // It is necessary to calculate the correct value of signed_immed_24 + // field. We subtract the base addr from the target addr to form a + // byte offset, which must be inside the range -33554432 and +33554428. + // Then, we set the signed_immed_24 field of the instruction to bits + // [25:2] of the byte offset. More details ARM-ARM p. A4-11. 
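+ // For example (addresses purely illustrative): a branch at 0x8000 that
+ // targets 0x9000 gives a byte offset of 0x9000 - 0x8000 - 8 = 0xff8, so
+ // signed_immed_24 = (0xff8 & 0x03fffffc) >> 2 = 0x3fe.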
+ ResultPtr = ResultPtr - (intptr_t)RelocPos - 8; + ResultPtr = (ResultPtr & 0x03FFFFFC) >> 2; + assert(ResultPtr >= -33554432 && ResultPtr <= 33554428); + *((intptr_t*)RelocPos) |= ResultPtr; + break; + } + case ARM::reloc_arm_jt_base: { + // JT base - (instruction addr + 8) + ResultPtr = ResultPtr - (intptr_t)RelocPos - 8; + *((intptr_t*)RelocPos) |= ResultPtr; + break; + } + case ARM::reloc_arm_movw: { + ResultPtr = ResultPtr & 0xFFFF; + *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF; + *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16; + break; + } + case ARM::reloc_arm_movt: { + ResultPtr = (ResultPtr >> 16) & 0xFFFF; + *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF; + *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16; + break; + } + } + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.h b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h new file mode 100644 index 0000000..ff332b7 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h @@ -0,0 +1,182 @@ +//===- ARMJITInfo.h - ARM implementation of the JIT interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the ARMJITInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMJITINFO_H +#define ARMJITINFO_H + +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/Target/TargetJITInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" + +namespace llvm { + class ARMTargetMachine; + + class ARMJITInfo : public TargetJITInfo { + // ConstPoolId2AddrMap - A map from constant pool ids to the corresponding + // CONSTPOOL_ENTRY addresses. + SmallVector<intptr_t, 16> ConstPoolId2AddrMap; + + // JumpTableId2AddrMap - A map from inline jumptable ids to the + // corresponding inline jump table bases. + SmallVector<intptr_t, 16> JumpTableId2AddrMap; + + // PCLabelMap - A map from PC labels to addresses. + DenseMap<unsigned, intptr_t> PCLabelMap; + + // Sym2IndirectSymMap - A map from symbol (GlobalValue and ExternalSymbol) + // addresses to their indirect symbol addresses. + DenseMap<void*, intptr_t> Sym2IndirectSymMap; + + // IsPIC - True if the relocation model is PIC. This is used to determine + // how to codegen function stubs. + bool IsPIC; + + public: + explicit ARMJITInfo() : IsPIC(false) { useGOT = false; } + + /// replaceMachineCodeForFunction - Make it so that calling the function + /// whose machine code is at OLD turns into a call to NEW, perhaps by + /// overwriting OLD with a branch to NEW. This is used for self-modifying + /// code. + /// + virtual void replaceMachineCodeForFunction(void *Old, void *New); + + /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object + /// to emit an indirect symbol which contains the address of the specified + /// ptr. + virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, + JITCodeEmitter &JCE); + + // getStubLayout - Returns the size and alignment of the largest call stub + // on ARM. 
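+ // The 16-byte worst case is the PIC stub and the compilation-callback stub
+ // (three instructions plus a 4-byte word); the direct-branch stub emitted by
+ // emitFunctionStub needs only 8 bytes.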
+ virtual StubLayout getStubLayout(); + + /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a + /// small native function that simply calls the function at the specified + /// address. + virtual void *emitFunctionStub(const Function* F, void *Fn, + JITCodeEmitter &JCE); + + /// getLazyResolverFunction - Expose the lazy resolver to the JIT. + virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); + + /// relocate - Before the JIT can run a block of code that has been emitted, + /// it must rewrite the code to contain the actual addresses of any + /// referenced global symbols. + virtual void relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase); + + /// hasCustomConstantPool - Allows a target to specify that constant + /// pool address resolution is handled by the target. + virtual bool hasCustomConstantPool() const { return true; } + + /// hasCustomJumpTables - Allows a target to specify that jumptables + /// are emitted by the target. + virtual bool hasCustomJumpTables() const { return true; } + + /// allocateSeparateGVMemory - If true, globals should be placed in + /// separately allocated heap memory rather than in the same + /// code memory allocated by JITCodeEmitter. + virtual bool allocateSeparateGVMemory() const { +#ifdef __APPLE__ + return true; +#else + return false; +#endif + } + + /// Initialize - Initialize internal stage for the function being JITted. + /// Resize constant pool ids to CONSTPOOL_ENTRY addresses map; resize + /// jump table ids to jump table bases map; remember if codegen relocation + /// model is PIC. + void Initialize(const MachineFunction &MF, bool isPIC) { + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + ConstPoolId2AddrMap.resize(AFI->getNumConstPoolEntries()); + JumpTableId2AddrMap.resize(AFI->getNumJumpTables()); + IsPIC = isPIC; + } + + /// getConstantPoolEntryAddr - The ARM target puts all constant + /// pool entries into constant islands. This returns the address of the + /// constant pool entry of the specified index. + intptr_t getConstantPoolEntryAddr(unsigned CPI) const { + assert(CPI < ConstPoolId2AddrMap.size()); + return ConstPoolId2AddrMap[CPI]; + } + + /// addConstantPoolEntryAddr - Map a Constant Pool Index to the address + /// where its associated value is stored. When relocations are processed, + /// this value will be used to resolve references to the constant. + void addConstantPoolEntryAddr(unsigned CPI, intptr_t Addr) { + assert(CPI < ConstPoolId2AddrMap.size()); + ConstPoolId2AddrMap[CPI] = Addr; + } + + /// getJumpTableBaseAddr - The ARM target inline all jump tables within + /// text section of the function. This returns the address of the base of + /// the jump table of the specified index. + intptr_t getJumpTableBaseAddr(unsigned JTI) const { + assert(JTI < JumpTableId2AddrMap.size()); + return JumpTableId2AddrMap[JTI]; + } + + /// addJumpTableBaseAddr - Map a jump table index to the address where + /// the corresponding inline jump table is emitted. When relocations are + /// processed, this value will be used to resolve references to the + /// jump table. + void addJumpTableBaseAddr(unsigned JTI, intptr_t Addr) { + assert(JTI < JumpTableId2AddrMap.size()); + JumpTableId2AddrMap[JTI] = Addr; + } + + /// getPCLabelAddr - Retrieve the address of the PC label of the specified id. 
+ intptr_t getPCLabelAddr(unsigned Id) const { + DenseMap<unsigned, intptr_t>::const_iterator I = PCLabelMap.find(Id); + assert(I != PCLabelMap.end()); + return I->second; + } + + /// addPCLabelAddr - Remember the address of the specified PC label. + void addPCLabelAddr(unsigned Id, intptr_t Addr) { + PCLabelMap.insert(std::make_pair(Id, Addr)); + } + + /// getIndirectSymAddr - Retrieve the address of the indirect symbol of the + /// specified symbol located at address. Returns 0 if the indirect symbol + /// has not been emitted. + intptr_t getIndirectSymAddr(void *Addr) const { + DenseMap<void*,intptr_t>::const_iterator I= Sym2IndirectSymMap.find(Addr); + if (I != Sym2IndirectSymMap.end()) + return I->second; + return 0; + } + + /// addIndirectSymAddr - Add a mapping from address of an emitted symbol to + /// its indirect symbol address. + void addIndirectSymAddr(void *SymAddr, intptr_t IndSymAddr) { + Sym2IndirectSymMap.insert(std::make_pair(SymAddr, IndSymAddr)); + } + + private: + /// resolveRelocDestAddr - Resolve the resulting address of the relocation + /// if it's not already solved. Constantpool entries must be resolved by + /// ARM target. + intptr_t resolveRelocDestAddr(MachineRelocation *MR) const; + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp new file mode 100644 index 0000000..8585c1e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -0,0 +1,1658 @@ +//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that performs load / store related peephole +// optimizations. This pass should be run after register allocation. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-ldst-opt" +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMBaseInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMRegisterInfo.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumLDMGened , "Number of ldm instructions generated"); +STATISTIC(NumSTMGened , "Number of stm instructions generated"); +STATISTIC(NumVLDMGened, "Number of vldm instructions generated"); +STATISTIC(NumVSTMGened, "Number of vstm instructions generated"); +STATISTIC(NumLdStMoved, "Number of load / store instructions moved"); +STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation"); +STATISTIC(NumSTRDFormed,"Number of strd created before allocation"); +STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm"); +STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm"); +STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's"); +STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's"); + +/// ARMLoadStoreOpt - Post-register allocation pass that combines +/// load / store instructions to form ldm / stm instructions.
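+/// For example (registers illustrative), a run of loads off a common base:
+///     ldr r4, [r0]
+///     ldr r5, [r0, #4]
+///     ldr r6, [r0, #8]
+/// is rewritten into a single
+///     ldmia r0, {r4, r5, r6}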
+ +namespace { + struct ARMLoadStoreOpt : public MachineFunctionPass { + static char ID; + ARMLoadStoreOpt() : MachineFunctionPass(&ID) {} + + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + ARMFunctionInfo *AFI; + RegScavenger *RS; + bool isThumb2; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM load / store optimization pass"; + } + + private: + struct MemOpQueueEntry { + int Offset; + unsigned Position; + MachineBasicBlock::iterator MBBI; + bool Merged; + MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i) + : Offset(o), Position(p), MBBI(i), Merged(false) {} + }; + typedef SmallVector<MemOpQueueEntry,8> MemOpQueue; + typedef MemOpQueue::iterator MemOpQueueIter; + + bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + int Offset, unsigned Base, bool BaseKill, int Opcode, + ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, + DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs); + void MergeOpsUpdate(MachineBasicBlock &MBB, + MemOpQueue &MemOps, + unsigned memOpsBegin, + unsigned memOpsEnd, + unsigned insertAfter, + int Offset, + unsigned Base, + bool BaseKill, + int Opcode, + ARMCC::CondCodes Pred, + unsigned PredReg, + unsigned Scratch, + DebugLoc dl, + SmallVector<MachineBasicBlock::iterator, 4> &Merges); + void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base, + int Opcode, unsigned Size, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned Scratch, MemOpQueue &MemOps, + SmallVector<MachineBasicBlock::iterator, 4> &Merges); + + void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps); + bool FixInvalidRegPairOp(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI); + bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const TargetInstrInfo *TII, + bool &Advance, + MachineBasicBlock::iterator &I); + bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool &Advance, + MachineBasicBlock::iterator &I); + bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); + bool MergeReturnIntoLDM(MachineBasicBlock &MBB); + }; + char ARMLoadStoreOpt::ID = 0; +} + +static int getLoadStoreMultipleOpcode(int Opcode) { + switch (Opcode) { + case ARM::LDR: + NumLDMGened++; + return ARM::LDM; + case ARM::STR: + NumSTMGened++; + return ARM::STM; + case ARM::t2LDRi8: + case ARM::t2LDRi12: + NumLDMGened++; + return ARM::t2LDM; + case ARM::t2STRi8: + case ARM::t2STRi12: + NumSTMGened++; + return ARM::t2STM; + case ARM::VLDRS: + NumVLDMGened++; + return ARM::VLDMS; + case ARM::VSTRS: + NumVSTMGened++; + return ARM::VSTMS; + case ARM::VLDRD: + NumVLDMGened++; + return ARM::VLDMD; + case ARM::VSTRD: + NumVSTMGened++; + return ARM::VSTMD; + default: llvm_unreachable("Unhandled opcode!"); + } + return 0; +} + +static bool isT2i32Load(unsigned Opc) { + return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8; +} + +static bool isi32Load(unsigned Opc) { + return Opc == ARM::LDR || isT2i32Load(Opc); +} + +static bool isT2i32Store(unsigned Opc) { + return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8; +} + +static bool isi32Store(unsigned Opc) { + return Opc == ARM::STR || isT2i32Store(Opc); +} + +/// MergeOps - Create and insert a LDM or STM with Base as base register and +/// registers in Regs as the register operands that would be loaded / stored. +/// It returns true if the transformation is done. 
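The implementation below first maps the run's starting offset to an LDM/STM addressing sub-mode: 0 means increment-after (ia), +4 increment-before (ib), -4*(NumRegs-1) decrement-after (da) and -4*NumRegs decrement-before (db); Thumb2 rejects the ib and da forms, and any other offset forces a new base register to be materialized first (only considered worthwhile for three or more registers). A condensed, illustrative restatement of that decision, assuming the ARM_AM helpers from ARMAddressingModes.h included above:

  // Illustrative sub-mode selection for an integer LDM/STM of NumRegs words.
  // Mirrors the checks in MergeOps below; not a drop-in replacement for it.
  static ARM_AM::AMSubMode pickSubMode(int Offset, unsigned NumRegs,
                                       bool &NeedNewBase) {
    NeedNewBase = false;
    if (Offset == 0)                     return ARM_AM::ia; // [Rn] .. [Rn+4n-4]
    if (Offset == 4)                     return ARM_AM::ib; // [Rn+4] .. [Rn+4n]
    if (Offset == -4 * (int)NumRegs + 4) return ARM_AM::da; // highest addr [Rn]
    if (Offset == -4 * (int)NumRegs)     return ARM_AM::db; // highest addr [Rn-4]
    NeedNewBase = true;  // ADD/SUB a scratch (or destination) register first
    return ARM_AM::ia;
  }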
+bool +ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + int Offset, unsigned Base, bool BaseKill, + int Opcode, ARMCC::CondCodes Pred, + unsigned PredReg, unsigned Scratch, DebugLoc dl, + SmallVector<std::pair<unsigned, bool>, 8> &Regs) { + // Only a single register to load / store. Don't bother. + unsigned NumRegs = Regs.size(); + if (NumRegs <= 1) + return false; + + ARM_AM::AMSubMode Mode = ARM_AM::ia; + bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode); + if (isAM4 && Offset == 4) { + if (isThumb2) + // Thumb2 does not support ldmib / stmib. + return false; + Mode = ARM_AM::ib; + } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) { + if (isThumb2) + // Thumb2 does not support ldmda / stmda. + return false; + Mode = ARM_AM::da; + } else if (isAM4 && Offset == -4 * (int)NumRegs) { + Mode = ARM_AM::db; + } else if (Offset != 0) { + // If starting offset isn't zero, insert a MI to materialize a new base. + // But only do so if it is cost effective, i.e. merging more than two + // loads / stores. + if (NumRegs <= 2) + return false; + + unsigned NewBase; + if (isi32Load(Opcode)) + // If it is a load, then just use one of the destination register to + // use as the new base. + NewBase = Regs[NumRegs-1].first; + else { + // Use the scratch register to use as a new base. + NewBase = Scratch; + if (NewBase == 0) + return false; + } + int BaseOpc = !isThumb2 + ? ARM::ADDri + : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri); + if (Offset < 0) { + BaseOpc = !isThumb2 + ? ARM::SUBri + : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri); + Offset = - Offset; + } + int ImmedOffset = isThumb2 + ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset); + if (ImmedOffset == -1) + // FIXME: Try t2ADDri12 or t2SUBri12? + return false; // Probably not worth it then. + + BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase) + .addReg(Base, getKillRegState(BaseKill)).addImm(Offset) + .addImm(Pred).addReg(PredReg).addReg(0); + Base = NewBase; + BaseKill = true; // New base is always killed right its use. + } + + bool isDPR = (Opcode == ARM::VLDRD || Opcode == ARM::VSTRD); + bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS || + Opcode == ARM::VLDRD); + Opcode = getLoadStoreMultipleOpcode(Opcode); + MachineInstrBuilder MIB = (isAM4) + ? BuildMI(MBB, MBBI, dl, TII->get(Opcode)) + .addReg(Base, getKillRegState(BaseKill)) + .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg) + : BuildMI(MBB, MBBI, dl, TII->get(Opcode)) + .addReg(Base, getKillRegState(BaseKill)) + .addImm(ARM_AM::getAM5Opc(Mode, isDPR ? NumRegs<<1 : NumRegs)) + .addImm(Pred).addReg(PredReg); + for (unsigned i = 0; i != NumRegs; ++i) + MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef) + | getKillRegState(Regs[i].second)); + + return true; +} + +// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on +// success. +void ARMLoadStoreOpt:: +MergeOpsUpdate(MachineBasicBlock &MBB, + MemOpQueue &memOps, + unsigned memOpsBegin, + unsigned memOpsEnd, + unsigned insertAfter, + int Offset, + unsigned Base, + bool BaseKill, + int Opcode, + ARMCC::CondCodes Pred, + unsigned PredReg, + unsigned Scratch, + DebugLoc dl, + SmallVector<MachineBasicBlock::iterator, 4> &Merges) { + // First calculate which of the registers should be killed by the merged + // instruction. 
+ SmallVector<std::pair<unsigned, bool>, 8> Regs; + const unsigned insertPos = memOps[insertAfter].Position; + for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { + const MachineOperand &MO = memOps[i].MBBI->getOperand(0); + unsigned Reg = MO.getReg(); + bool isKill = MO.isKill(); + + // If we are inserting the merged operation after an unmerged operation that + // uses the same register, make sure to transfer any kill flag. + for (unsigned j = memOpsEnd, e = memOps.size(); !isKill && j != e; ++j) + if (memOps[j].Position<insertPos) { + const MachineOperand &MOJ = memOps[j].MBBI->getOperand(0); + if (MOJ.getReg() == Reg && MOJ.isKill()) + isKill = true; + } + + Regs.push_back(std::make_pair(Reg, isKill)); + } + + // Try to do the merge. + MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI; + Loc++; + if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode, + Pred, PredReg, Scratch, dl, Regs)) + return; + + // Merge succeeded, update records. + Merges.push_back(prior(Loc)); + for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { + // Remove kill flags from any unmerged memops that come before insertPos. + if (Regs[i-memOpsBegin].second) + for (unsigned j = memOpsEnd, e = memOps.size(); j != e; ++j) + if (memOps[j].Position<insertPos) { + MachineOperand &MOJ = memOps[j].MBBI->getOperand(0); + if (MOJ.getReg() == Regs[i-memOpsBegin].first && MOJ.isKill()) + MOJ.setIsKill(false); + } + MBB.erase(memOps[i].MBBI); + memOps[i].Merged = true; + } +} + +/// MergeLDR_STR - Merge a number of load / store instructions into one or more +/// load / store multiple instructions. +void +ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, + unsigned Base, int Opcode, unsigned Size, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned Scratch, MemOpQueue &MemOps, + SmallVector<MachineBasicBlock::iterator, 4> &Merges) { + bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode); + int Offset = MemOps[SIndex].Offset; + int SOffset = Offset; + unsigned insertAfter = SIndex; + MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI; + DebugLoc dl = Loc->getDebugLoc(); + const MachineOperand &PMO = Loc->getOperand(0); + unsigned PReg = PMO.getReg(); + unsigned PRegNum = PMO.isUndef() ? UINT_MAX + : ARMRegisterInfo::getRegisterNumbering(PReg); + unsigned Count = 1; + + for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { + int NewOffset = MemOps[i].Offset; + const MachineOperand &MO = MemOps[i].MBBI->getOperand(0); + unsigned Reg = MO.getReg(); + unsigned RegNum = MO.isUndef() ? UINT_MAX + : ARMRegisterInfo::getRegisterNumbering(Reg); + // AM4 - register numbers in ascending order. + // AM5 - consecutive register numbers in ascending order. + // Can only do up to 16 double-word registers per insn. + if (Reg != ARM::SP && + NewOffset == Offset + (int)Size && + ((isAM4 && RegNum > PRegNum) + || ((Size < 8 || Count < 16) && RegNum == PRegNum+1))) { + Offset += Size; + PRegNum = RegNum; + ++Count; + } else { + // Can't merge this in. Try merge the earlier ones first. 
+ MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, + Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges); + MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch, + MemOps, Merges); + return; + } + + if (MemOps[i].Position > MemOps[insertAfter].Position) + insertAfter = i; + } + + bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1; + MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset, + Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges); + return; +} + +static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base, + unsigned Bytes, unsigned Limit, + ARMCC::CondCodes Pred, unsigned PredReg){ + unsigned MyPredReg = 0; + if (!MI) + return false; + if (MI->getOpcode() != ARM::t2SUBri && + MI->getOpcode() != ARM::t2SUBrSPi && + MI->getOpcode() != ARM::t2SUBrSPi12 && + MI->getOpcode() != ARM::tSUBspi && + MI->getOpcode() != ARM::SUBri) + return false; + + // Make sure the offset fits in 8 bits. + if (Bytes <= 0 || (Limit && Bytes >= Limit)) + return false; + + unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME + return (MI->getOperand(0).getReg() == Base && + MI->getOperand(1).getReg() == Base && + (MI->getOperand(2).getImm()*Scale) == Bytes && + llvm::getInstrPredicate(MI, MyPredReg) == Pred && + MyPredReg == PredReg); +} + +static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base, + unsigned Bytes, unsigned Limit, + ARMCC::CondCodes Pred, unsigned PredReg){ + unsigned MyPredReg = 0; + if (!MI) + return false; + if (MI->getOpcode() != ARM::t2ADDri && + MI->getOpcode() != ARM::t2ADDrSPi && + MI->getOpcode() != ARM::t2ADDrSPi12 && + MI->getOpcode() != ARM::tADDspi && + MI->getOpcode() != ARM::ADDri) + return false; + + if (Bytes <= 0 || (Limit && Bytes >= Limit)) + // Make sure the offset fits in 8 bits. + return false; + + unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 
4 : 1; // FIXME + return (MI->getOperand(0).getReg() == Base && + MI->getOperand(1).getReg() == Base && + (MI->getOperand(2).getImm()*Scale) == Bytes && + llvm::getInstrPredicate(MI, MyPredReg) == Pred && + MyPredReg == PredReg); +} + +static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return 0; + case ARM::LDR: + case ARM::STR: + case ARM::t2LDRi8: + case ARM::t2LDRi12: + case ARM::t2STRi8: + case ARM::t2STRi12: + case ARM::VLDRS: + case ARM::VSTRS: + return 4; + case ARM::VLDRD: + case ARM::VSTRD: + return 8; + case ARM::LDM: + case ARM::STM: + case ARM::t2LDM: + case ARM::t2STM: + return (MI->getNumOperands() - 4) * 4; + case ARM::VLDMS: + case ARM::VSTMS: + case ARM::VLDMD: + case ARM::VSTMD: + return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4; + } +} + +static unsigned getUpdatingLSMultipleOpcode(unsigned Opc) { + switch (Opc) { + case ARM::LDM: return ARM::LDM_UPD; + case ARM::STM: return ARM::STM_UPD; + case ARM::t2LDM: return ARM::t2LDM_UPD; + case ARM::t2STM: return ARM::t2STM_UPD; + case ARM::VLDMS: return ARM::VLDMS_UPD; + case ARM::VLDMD: return ARM::VLDMD_UPD; + case ARM::VSTMS: return ARM::VSTMS_UPD; + case ARM::VSTMD: return ARM::VSTMD_UPD; + default: llvm_unreachable("Unhandled opcode!"); + } + return 0; +} + +/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base +/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible: +/// +/// stmia rn, <ra, rb, rc> +/// rn := rn + 4 * 3; +/// => +/// stmia rn!, <ra, rb, rc> +/// +/// rn := rn - 4 * 3; +/// ldmia rn, <ra, rb, rc> +/// => +/// ldmdb rn!, <ra, rb, rc> +bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool &Advance, + MachineBasicBlock::iterator &I) { + MachineInstr *MI = MBBI; + unsigned Base = MI->getOperand(0).getReg(); + bool BaseKill = MI->getOperand(0).isKill(); + unsigned Bytes = getLSMultipleTransferSize(MI); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); + int Opcode = MI->getOpcode(); + DebugLoc dl = MI->getDebugLoc(); + bool isAM4 = (Opcode == ARM::LDM || Opcode == ARM::t2LDM || + Opcode == ARM::STM || Opcode == ARM::t2STM); + + bool DoMerge = false; + ARM_AM::AMSubMode Mode = ARM_AM::ia; + unsigned Offset = 0; + + if (isAM4) { + // Can't use an updating ld/st if the base register is also a dest + // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. + for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).getReg() == Base) + return false; + } + Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm()); + } else { + // VLDM{D|S}, VSTM{D|S} addressing mode 5 ops. + Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm()); + Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm()); + } + + // Try merging with the previous instruction. + if (MBBI != MBB.begin()) { + MachineBasicBlock::iterator PrevMBBI = prior(MBBI); + if (isAM4) { + if (Mode == ARM_AM::ia && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + Mode = ARM_AM::db; + } else if (isAM4 && Mode == ARM_AM::ib && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + Mode = ARM_AM::da; + } + } else { + if (Mode == ARM_AM::ia && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + Mode = ARM_AM::db; + DoMerge = true; + } + } + if (DoMerge) + MBB.erase(PrevMBBI); + } + + // Try merging with the next instruction. 
+ if (!DoMerge && MBBI != MBB.end()) { + MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); + if (isAM4) { + if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && + isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && + isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + } + } else { + if (Mode == ARM_AM::ia && + isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + } + } + if (DoMerge) { + if (NextMBBI == I) { + Advance = true; + ++I; + } + MBB.erase(NextMBBI); + } + } + + if (!DoMerge) + return false; + + unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) + .addReg(Base, getDefRegState(true)) // WB base register + .addReg(Base, getKillRegState(BaseKill)); + if (isAM4) { + // [t2]LDM_UPD, [t2]STM_UPD + MIB.addImm(ARM_AM::getAM4ModeImm(Mode)) + .addImm(Pred).addReg(PredReg); + } else { + // VLDM[SD}_UPD, VSTM[SD]_UPD + MIB.addImm(ARM_AM::getAM5Opc(Mode, Offset)) + .addImm(Pred).addReg(PredReg); + } + // Transfer the rest of operands. + for (unsigned OpNum = 4, e = MI->getNumOperands(); OpNum != e; ++OpNum) + MIB.addOperand(MI->getOperand(OpNum)); + // Transfer memoperands. + (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + MBB.erase(MBBI); + return true; +} + +static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) { + switch (Opc) { + case ARM::LDR: return ARM::LDR_PRE; + case ARM::STR: return ARM::STR_PRE; + case ARM::VLDRS: return ARM::VLDMS_UPD; + case ARM::VLDRD: return ARM::VLDMD_UPD; + case ARM::VSTRS: return ARM::VSTMS_UPD; + case ARM::VSTRD: return ARM::VSTMD_UPD; + case ARM::t2LDRi8: + case ARM::t2LDRi12: + return ARM::t2LDR_PRE; + case ARM::t2STRi8: + case ARM::t2STRi12: + return ARM::t2STR_PRE; + default: llvm_unreachable("Unhandled opcode!"); + } + return 0; +} + +static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) { + switch (Opc) { + case ARM::LDR: return ARM::LDR_POST; + case ARM::STR: return ARM::STR_POST; + case ARM::VLDRS: return ARM::VLDMS_UPD; + case ARM::VLDRD: return ARM::VLDMD_UPD; + case ARM::VSTRS: return ARM::VSTMS_UPD; + case ARM::VSTRD: return ARM::VSTMD_UPD; + case ARM::t2LDRi8: + case ARM::t2LDRi12: + return ARM::t2LDR_POST; + case ARM::t2STRi8: + case ARM::t2STRi12: + return ARM::t2STR_POST; + default: llvm_unreachable("Unhandled opcode!"); + } + return 0; +} + +/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base +/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible: +bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const TargetInstrInfo *TII, + bool &Advance, + MachineBasicBlock::iterator &I) { + MachineInstr *MI = MBBI; + unsigned Base = MI->getOperand(1).getReg(); + bool BaseKill = MI->getOperand(1).isKill(); + unsigned Bytes = getLSMultipleTransferSize(MI); + int Opcode = MI->getOpcode(); + DebugLoc dl = MI->getDebugLoc(); + bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS || + Opcode == ARM::VSTRD || Opcode == ARM::VSTRS); + bool isAM2 = (Opcode == ARM::LDR || Opcode == ARM::STR); + if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0) + return false; + if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) + return false; + if (isT2i32Load(Opcode) || isT2i32Store(Opcode)) + if (MI->getOperand(2).getImm() != 0) + return false; + + bool isLd = isi32Load(Opcode) || Opcode == 
ARM::VLDRS || Opcode == ARM::VLDRD; + // Can't do the merge if the destination register is the same as the would-be + // writeback register. + if (isLd && MI->getOperand(0).getReg() == Base) + return false; + + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); + bool DoMerge = false; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + unsigned NewOpc = 0; + // AM2 - 12 bits, thumb2 - 8 bits. + unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); + + // Try merging with the previous instruction. + if (MBBI != MBB.begin()) { + MachineBasicBlock::iterator PrevMBBI = prior(MBBI); + if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { + DoMerge = true; + AddSub = ARM_AM::sub; + } else if (!isAM5 && + isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) { + DoMerge = true; + } + if (DoMerge) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode); + MBB.erase(PrevMBBI); + } + } + + // Try merging with the next instruction. + if (!DoMerge && MBBI != MBB.end()) { + MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); + if (!isAM5 && + isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { + DoMerge = true; + AddSub = ARM_AM::sub; + } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) { + DoMerge = true; + } + if (DoMerge) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode); + if (NextMBBI == I) { + Advance = true; + ++I; + } + MBB.erase(NextMBBI); + } + } + + if (!DoMerge) + return false; + + bool isDPR = NewOpc == ARM::VLDMD || NewOpc == ARM::VSTMD; + unsigned Offset = 0; + if (isAM5) + Offset = ARM_AM::getAM5Opc(AddSub == ARM_AM::sub ? ARM_AM::db : ARM_AM::ia, + (isDPR ? 2 : 1)); + else if (isAM2) + Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + else + Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; + + if (isAM5) { + // VLDM[SD}_UPD, VSTM[SD]_UPD + MachineOperand &MO = MI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) + .addReg(Base, getDefRegState(true)) // WB base register + .addReg(Base, getKillRegState(isLd ? BaseKill : false)) + .addImm(Offset) + .addImm(Pred).addReg(PredReg) + .addReg(MO.getReg(), (isLd ? getDefRegState(true) : + getKillRegState(MO.isKill()))); + } else if (isLd) { + if (isAM2) + // LDR_PRE, LDR_POST, + BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + .addReg(Base, RegState::Define) + .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + else + // t2LDR_PRE, t2LDR_POST + BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + .addReg(Base, RegState::Define) + .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + } else { + MachineOperand &MO = MI->getOperand(0); + if (isAM2) + // STR_PRE, STR_POST + BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + .addReg(MO.getReg(), getKillRegState(MO.isKill())) + .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + else + // t2STR_PRE, t2STR_POST + BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + .addReg(MO.getReg(), getKillRegState(MO.isKill())) + .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + } + MBB.erase(MBBI); + + return true; +} + +/// isMemoryOp - Returns true if instruction is a memory operations (that this +/// pass is capable of operating on). +static bool isMemoryOp(const MachineInstr *MI) { + if (MI->hasOneMemOperand()) { + const MachineMemOperand *MMO = *MI->memoperands_begin(); + + // Don't touch volatile memory accesses - we may be changing their order. 
+ if (MMO->isVolatile()) + return false; + + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. + if (MMO->getAlignment() < 4) + return false; + } + + // str <undef> could probably be eliminated entirely, but for now we just want + // to avoid making a mess of it. + // FIXME: Use str <undef> as a wildcard to enable better stm folding. + if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() && + MI->getOperand(0).isUndef()) + return false; + + // Likewise don't mess with references to undefined addresses. + if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() && + MI->getOperand(1).isUndef()) + return false; + + int Opcode = MI->getOpcode(); + switch (Opcode) { + default: break; + case ARM::LDR: + case ARM::STR: + return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0; + case ARM::VLDRS: + case ARM::VSTRS: + return MI->getOperand(1).isReg(); + case ARM::VLDRD: + case ARM::VSTRD: + return MI->getOperand(1).isReg(); + case ARM::t2LDRi8: + case ARM::t2LDRi12: + case ARM::t2STRi8: + case ARM::t2STRi12: + return MI->getOperand(1).isReg(); + } + return false; +} + +/// AdvanceRS - Advance register scavenger to just before the earliest memory +/// op that is being merged. +void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { + MachineBasicBlock::iterator Loc = MemOps[0].MBBI; + unsigned Position = MemOps[0].Position; + for (unsigned i = 1, e = MemOps.size(); i != e; ++i) { + if (MemOps[i].Position < Position) { + Position = MemOps[i].Position; + Loc = MemOps[i].MBBI; + } + } + + if (Loc != MBB.begin()) + RS->forward(prior(Loc)); +} + +static int getMemoryOpOffset(const MachineInstr *MI) { + int Opcode = MI->getOpcode(); + bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR; + bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD; + unsigned NumOperands = MI->getDesc().getNumOperands(); + unsigned OffField = MI->getOperand(NumOperands-3).getImm(); + + if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 || + Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 || + Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) + return OffField; + + int Offset = isAM2 + ? ARM_AM::getAM2Offset(OffField) + : (isAM3 ? 
ARM_AM::getAM3Offset(OffField) + : ARM_AM::getAM5Offset(OffField) * 4); + if (isAM2) { + if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub) + Offset = -Offset; + } else if (isAM3) { + if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub) + Offset = -Offset; + } else { + if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub) + Offset = -Offset; + } + return Offset; +} + +static void InsertLDR_STR(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + int OffImm, bool isDef, + DebugLoc dl, unsigned NewOpc, + unsigned Reg, bool RegDeadKill, bool RegUndef, + unsigned BaseReg, bool BaseKill, bool BaseUndef, + unsigned OffReg, bool OffKill, bool OffUndef, + ARMCC::CondCodes Pred, unsigned PredReg, + const TargetInstrInfo *TII, bool isT2) { + int Offset = OffImm; + if (!isT2) { + if (OffImm < 0) + Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift); + else + Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift); + } + if (isDef) { + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), + TII->get(NewOpc)) + .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill)) + .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); + if (!isT2) + MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef)); + MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + } else { + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), + TII->get(NewOpc)) + .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef)) + .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); + if (!isT2) + MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef)); + MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + } +} + +bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = &*MBBI; + unsigned Opcode = MI->getOpcode(); + if (Opcode == ARM::LDRD || Opcode == ARM::STRD || + Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) { + unsigned EvenReg = MI->getOperand(0).getReg(); + unsigned OddReg = MI->getOperand(1).getReg(); + unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false); + unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false); + if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum) + return false; + + bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; + bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; + bool EvenDeadKill = isLd ? + MI->getOperand(0).isDead() : MI->getOperand(0).isKill(); + bool EvenUndef = MI->getOperand(0).isUndef(); + bool OddDeadKill = isLd ? + MI->getOperand(1).isDead() : MI->getOperand(1).isKill(); + bool OddUndef = MI->getOperand(1).isUndef(); + const MachineOperand &BaseOp = MI->getOperand(2); + unsigned BaseReg = BaseOp.getReg(); + bool BaseKill = BaseOp.isKill(); + bool BaseUndef = BaseOp.isUndef(); + unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg(); + bool OffKill = isT2 ? false : MI->getOperand(3).isKill(); + bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef(); + int OffImm = getMemoryOpOffset(MI); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); + + if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) { + // Ascending register numbers and no offset. It's safe to change it to a + // ldm or stm. + unsigned NewOpc = (isLd) + ? (isT2 ? ARM::t2LDM : ARM::LDM) + : (isT2 ? 
ARM::t2STM : ARM::STM); + if (isLd) { + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) + .addReg(BaseReg, getKillRegState(BaseKill)) + .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)) + .addImm(Pred).addReg(PredReg) + .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill)) + .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)); + ++NumLDRD2LDM; + } else { + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) + .addReg(BaseReg, getKillRegState(BaseKill)) + .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)) + .addImm(Pred).addReg(PredReg) + .addReg(EvenReg, + getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef)) + .addReg(OddReg, + getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); + ++NumSTRD2STM; + } + } else { + // Split into two instructions. + assert((!isT2 || !OffReg) && + "Thumb2 ldrd / strd does not encode offset register!"); + unsigned NewOpc = (isLd) + ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR) + : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR); + DebugLoc dl = MBBI->getDebugLoc(); + // If this is a load and base register is killed, it may have been + // re-defed by the load, make sure the first load does not clobber it. + if (isLd && + (BaseKill || OffKill) && + (TRI->regsOverlap(EvenReg, BaseReg) || + (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) { + assert(!TRI->regsOverlap(OddReg, BaseReg) && + (!OffReg || !TRI->regsOverlap(OddReg, OffReg))); + InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, + OddReg, OddDeadKill, false, + BaseReg, false, BaseUndef, OffReg, false, OffUndef, + Pred, PredReg, TII, isT2); + InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, + EvenReg, EvenDeadKill, false, + BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef, + Pred, PredReg, TII, isT2); + } else { + if (OddReg == EvenReg && EvenDeadKill) { + // If the two source operands are the same, the kill marker is probably + // on the first one. e.g. + // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0 + EvenDeadKill = false; + OddDeadKill = true; + } + InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, + EvenReg, EvenDeadKill, EvenUndef, + BaseReg, false, BaseUndef, OffReg, false, OffUndef, + Pred, PredReg, TII, isT2); + InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, + OddReg, OddDeadKill, OddUndef, + BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef, + Pred, PredReg, TII, isT2); + } + if (isLd) + ++NumLDRD2LDR; + else + ++NumSTRD2STR; + } + + MBBI = prior(MBBI); + MBB.erase(MI); + } + return false; +} + +/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR +/// ops of the same base and incrementing offset into LDM / STM ops. 
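While scanning a block, the function below keeps the pending memory operations sorted by offset, and it refuses to queue a second operation at an offset it has already seen, since such a collision can never be merged; it also restarts the chain when a load overwrites its own base register (the r5 example in the comments further down). A self-contained sketch of the sorted-queue bookkeeping, using a plain vector of offsets in place of the real MemOpQueue entries:

  #include <vector>

  // Returns true if Offset was queued, false on an exact collision.
  static bool queueOffsetSorted(std::vector<int> &Offsets, int Offset) {
    if (Offsets.empty() || Offset > Offsets.back()) {
      Offsets.push_back(Offset);                      // common case: ascending
      return true;
    }
    for (std::vector<int>::iterator I = Offsets.begin(), E = Offsets.end();
         I != E; ++I) {
      if (Offset < *I) { Offsets.insert(I, Offset); return true; }
      if (Offset == *I) return false;                 // collision, cannot merge
    }
    return true; // not reached: Offset <= back() always hits a case above
  }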
+bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { + unsigned NumMerges = 0; + unsigned NumMemOps = 0; + MemOpQueue MemOps; + unsigned CurrBase = 0; + int CurrOpc = -1; + unsigned CurrSize = 0; + ARMCC::CondCodes CurrPred = ARMCC::AL; + unsigned CurrPredReg = 0; + unsigned Position = 0; + SmallVector<MachineBasicBlock::iterator,4> Merges; + + RS->enterBasicBlock(&MBB); + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + if (FixInvalidRegPairOp(MBB, MBBI)) + continue; + + bool Advance = false; + bool TryMerge = false; + bool Clobber = false; + + bool isMemOp = isMemoryOp(MBBI); + if (isMemOp) { + int Opcode = MBBI->getOpcode(); + unsigned Size = getLSMultipleTransferSize(MBBI); + unsigned Base = MBBI->getOperand(1).getReg(); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg); + int Offset = getMemoryOpOffset(MBBI); + // Watch out for: + // r4 := ldr [r5] + // r5 := ldr [r5, #4] + // r6 := ldr [r5, #8] + // + // The second ldr has effectively broken the chain even though it + // looks like the later ldr(s) use the same base register. Try to + // merge the ldr's so far, including this one. But don't try to + // combine the following ldr(s). + Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg()); + if (CurrBase == 0 && !Clobber) { + // Start of a new chain. + CurrBase = Base; + CurrOpc = Opcode; + CurrSize = Size; + CurrPred = Pred; + CurrPredReg = PredReg; + MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI)); + NumMemOps++; + Advance = true; + } else { + if (Clobber) { + TryMerge = true; + Advance = true; + } + + if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { + // No need to match PredReg. + // Continue adding to the queue. + if (Offset > MemOps.back().Offset) { + MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI)); + NumMemOps++; + Advance = true; + } else { + for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); + I != E; ++I) { + if (Offset < I->Offset) { + MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI)); + NumMemOps++; + Advance = true; + break; + } else if (Offset == I->Offset) { + // Collision! This can't be merged! + break; + } + } + } + } + } + } + + if (Advance) { + ++Position; + ++MBBI; + if (MBBI == E) + // Reach the end of the block, try merging the memory instructions. + TryMerge = true; + } else + TryMerge = true; + + if (TryMerge) { + if (NumMemOps > 1) { + // Try to find a free register to use as a new base in case it's needed. + // First advance to the instruction just before the start of the chain. + AdvanceRS(MBB, MemOps); + // Find a scratch register. + unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass); + // Process the load / store instructions. + RS->forward(prior(MBBI)); + + // Merge ops. + Merges.clear(); + MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize, + CurrPred, CurrPredReg, Scratch, MemOps, Merges); + + // Try folding preceeding/trailing base inc/dec into the generated + // LDM/STM ops. + for (unsigned i = 0, e = Merges.size(); i < e; ++i) + if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI)) + ++NumMerges; + NumMerges += Merges.size(); + + // Try folding preceeding/trailing base inc/dec into those load/store + // that were not merged to form LDM/STM ops. + for (unsigned i = 0; i != NumMemOps; ++i) + if (!MemOps[i].Merged) + if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI)) + ++NumMerges; + + // RS may be pointing to an instruction that's deleted. 
+ RS->skipTo(prior(MBBI)); + } else if (NumMemOps == 1) { + // Try folding preceeding/trailing base inc/dec into the single + // load/store. + if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) { + ++NumMerges; + RS->forward(prior(MBBI)); + } + } + + CurrBase = 0; + CurrOpc = -1; + CurrSize = 0; + CurrPred = ARMCC::AL; + CurrPredReg = 0; + if (NumMemOps) { + MemOps.clear(); + NumMemOps = 0; + } + + // If iterator hasn't been advanced and this is not a memory op, skip it. + // It can't start a new chain anyway. + if (!Advance && !isMemOp && MBBI != E) { + ++Position; + ++MBBI; + } + } + } + return NumMerges > 0; +} + +namespace { + struct OffsetCompare { + bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const { + int LOffset = getMemoryOpOffset(LHS); + int ROffset = getMemoryOpOffset(RHS); + assert(LHS == RHS || LOffset != ROffset); + return LOffset > ROffset; + } + }; +} + +/// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops +/// ("bx lr" and "mov pc, lr") into the preceeding stack restore so it +/// directly restore the value of LR into pc. +/// ldmfd sp!, {..., lr} +/// bx lr +/// or +/// ldmfd sp!, {..., lr} +/// mov pc, lr +/// => +/// ldmfd sp!, {..., pc} +bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { + if (MBB.empty()) return false; + + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + if (MBBI != MBB.begin() && + (MBBI->getOpcode() == ARM::BX_RET || + MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::MOVPCLR)) { + MachineInstr *PrevMI = prior(MBBI); + if (PrevMI->getOpcode() == ARM::LDM_UPD || + PrevMI->getOpcode() == ARM::t2LDM_UPD) { + MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1); + if (MO.getReg() != ARM::LR) + return false; + unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET; + PrevMI->setDesc(TII->get(NewOpc)); + MO.setReg(ARM::PC); + MBB.erase(MBBI); + return true; + } + } + return false; +} + +bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + const TargetMachine &TM = Fn.getTarget(); + AFI = Fn.getInfo<ARMFunctionInfo>(); + TII = TM.getInstrInfo(); + TRI = TM.getRegisterInfo(); + RS = new RegScavenger(); + isThumb2 = AFI->isThumb2Function(); + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= LoadStoreMultipleOpti(MBB); + Modified |= MergeReturnIntoLDM(MBB); + } + + delete RS; + return Modified; +} + + +/// ARMPreAllocLoadStoreOpt - Pre- register allocation pass that move +/// load / stores from consecutive locations close to make it more +/// likely they will be combined later. 
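Register pairing is part of why the pre-RA pass exists alongside the post-RA one: ARM-mode ldrd/strd can only name an even-numbered register together with its odd successor, which is what FixInvalidRegPairOp above checks for (splitting the instruction back apart when the constraint is violated). When the pre-RA pass forms such a pair it therefore also plants register-allocation hints so the allocator is likely to choose a legal pairing. A small illustration; the predicate is hypothetical shorthand for the check above:

  // ARM-mode LDRD/STRD need Rt even and Rt2 == Rt + 1.
  static bool isValidLdrdPair(unsigned FirstRegNum, unsigned SecondRegNum) {
    return (FirstRegNum & 1) == 0 && SecondRegNum == FirstRegNum + 1;
  }

  // The pre-RA pass nudges the allocator toward such pairs (see RescheduleOps
  // below):
  //   MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
  //   MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd,  EvenReg);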
+ +namespace { + struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{ + static char ID; + ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {} + + const TargetData *TD; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const ARMSubtarget *STI; + MachineRegisterInfo *MRI; + MachineFunction *MF; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM pre- register allocation load / store optimization pass"; + } + + private: + bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, + unsigned &NewOpc, unsigned &EvenReg, + unsigned &OddReg, unsigned &BaseReg, + unsigned &OffReg, int &Offset, + unsigned &PredReg, ARMCC::CondCodes &Pred, + bool &isT2); + bool RescheduleOps(MachineBasicBlock *MBB, + SmallVector<MachineInstr*, 4> &Ops, + unsigned Base, bool isLd, + DenseMap<MachineInstr*, unsigned> &MI2LocMap); + bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB); + }; + char ARMPreAllocLoadStoreOpt::ID = 0; +} + +bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + TD = Fn.getTarget().getTargetData(); + TII = Fn.getTarget().getInstrInfo(); + TRI = Fn.getTarget().getRegisterInfo(); + STI = &Fn.getTarget().getSubtarget<ARMSubtarget>(); + MRI = &Fn.getRegInfo(); + MF = &Fn; + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) + Modified |= RescheduleLoadStoreInstrs(MFI); + + return Modified; +} + +static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator E, + SmallPtrSet<MachineInstr*, 4> &MemOps, + SmallSet<unsigned, 4> &MemRegs, + const TargetRegisterInfo *TRI) { + // Are there stores / loads / calls between them? + // FIXME: This is overly conservative. We should make use of alias information + // some day. + SmallSet<unsigned, 4> AddedRegPressure; + while (++I != E) { + if (MemOps.count(&*I)) + continue; + const TargetInstrDesc &TID = I->getDesc(); + if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects()) + return false; + if (isLd && TID.mayStore()) + return false; + if (!isLd) { + if (TID.mayLoad()) + return false; + // It's not safe to move the first 'str' down. + // str r1, [r0] + // strh r5, [r0] + // str r4, [r0, #+4] + if (TID.mayStore()) + return false; + } + for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) { + MachineOperand &MO = I->getOperand(j); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (MO.isDef() && TRI->regsOverlap(Reg, Base)) + return false; + if (Reg != Base && !MemRegs.count(Reg)) + AddedRegPressure.insert(Reg); + } + } + + // Estimate register pressure increase due to the transformation. + if (MemRegs.size() <= 4) + // Ok if we are moving small number of instructions. + return true; + return AddedRegPressure.size() <= MemRegs.size() * 2; +} + +bool +ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, + DebugLoc &dl, + unsigned &NewOpc, unsigned &EvenReg, + unsigned &OddReg, unsigned &BaseReg, + unsigned &OffReg, int &Offset, + unsigned &PredReg, + ARMCC::CondCodes &Pred, + bool &isT2) { + // Make sure we're allowed to generate LDRD/STRD. 
+ if (!STI->hasV5TEOps()) + return false; + + // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD + unsigned Scale = 1; + unsigned Opcode = Op0->getOpcode(); + if (Opcode == ARM::LDR) + NewOpc = ARM::LDRD; + else if (Opcode == ARM::STR) + NewOpc = ARM::STRD; + else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) { + NewOpc = ARM::t2LDRDi8; + Scale = 4; + isT2 = true; + } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) { + NewOpc = ARM::t2STRDi8; + Scale = 4; + isT2 = true; + } else + return false; + + // Make sure the offset registers match. + if (!isT2 && + (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg())) + return false; + + // Must sure the base address satisfies i64 ld / st alignment requirement. + if (!Op0->hasOneMemOperand() || + !(*Op0->memoperands_begin())->getValue() || + (*Op0->memoperands_begin())->isVolatile()) + return false; + + unsigned Align = (*Op0->memoperands_begin())->getAlignment(); + const Function *Func = MF->getFunction(); + unsigned ReqAlign = STI->hasV6Ops() + ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext())) + : 8; // Pre-v6 need 8-byte align + if (Align < ReqAlign) + return false; + + // Then make sure the immediate offset fits. + int OffImm = getMemoryOpOffset(Op0); + if (isT2) { + if (OffImm < 0) { + if (OffImm < -255) + // Can't fall back to t2LDRi8 / t2STRi8. + return false; + } else { + int Limit = (1 << 8) * Scale; + if (OffImm >= Limit || (OffImm & (Scale-1))) + return false; + } + Offset = OffImm; + } else { + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (OffImm < 0) { + AddSub = ARM_AM::sub; + OffImm = - OffImm; + } + int Limit = (1 << 8) * Scale; + if (OffImm >= Limit || (OffImm & (Scale-1))) + return false; + Offset = ARM_AM::getAM3Opc(AddSub, OffImm); + } + EvenReg = Op0->getOperand(0).getReg(); + OddReg = Op1->getOperand(0).getReg(); + if (EvenReg == OddReg) + return false; + BaseReg = Op0->getOperand(1).getReg(); + if (!isT2) + OffReg = Op0->getOperand(2).getReg(); + Pred = llvm::getInstrPredicate(Op0, PredReg); + dl = Op0->getDebugLoc(); + return true; +} + +bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, + SmallVector<MachineInstr*, 4> &Ops, + unsigned Base, bool isLd, + DenseMap<MachineInstr*, unsigned> &MI2LocMap) { + bool RetVal = false; + + // Sort by offset (in reverse order). + std::sort(Ops.begin(), Ops.end(), OffsetCompare()); + + // The loads / stores of the same base are in order. Scan them from first to + // last and check for the followins: + // 1. Any def of base. + // 2. Any gaps. + while (Ops.size() > 1) { + unsigned FirstLoc = ~0U; + unsigned LastLoc = 0; + MachineInstr *FirstOp = 0; + MachineInstr *LastOp = 0; + int LastOffset = 0; + unsigned LastOpcode = 0; + unsigned LastBytes = 0; + unsigned NumMove = 0; + for (int i = Ops.size() - 1; i >= 0; --i) { + MachineInstr *Op = Ops[i]; + unsigned Loc = MI2LocMap[Op]; + if (Loc <= FirstLoc) { + FirstLoc = Loc; + FirstOp = Op; + } + if (Loc >= LastLoc) { + LastLoc = Loc; + LastOp = Op; + } + + unsigned Opcode = Op->getOpcode(); + if (LastOpcode && Opcode != LastOpcode) + break; + + int Offset = getMemoryOpOffset(Op); + unsigned Bytes = getLSMultipleTransferSize(Op); + if (LastBytes) { + if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes)) + break; + } + LastOffset = Offset; + LastBytes = Bytes; + LastOpcode = Opcode; + if (++NumMove == 8) // FIXME: Tune this limit. 
+ break; + } + + if (NumMove <= 1) + Ops.pop_back(); + else { + SmallPtrSet<MachineInstr*, 4> MemOps; + SmallSet<unsigned, 4> MemRegs; + for (int i = NumMove-1; i >= 0; --i) { + MemOps.insert(Ops[i]); + MemRegs.insert(Ops[i]->getOperand(0).getReg()); + } + + // Be conservative, if the instructions are too far apart, don't + // move them. We want to limit the increase of register pressure. + bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this. + if (DoMove) + DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp, + MemOps, MemRegs, TRI); + if (!DoMove) { + for (unsigned i = 0; i != NumMove; ++i) + Ops.pop_back(); + } else { + // This is the new location for the loads / stores. + MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp; + while (InsertPos != MBB->end() && MemOps.count(InsertPos)) + ++InsertPos; + + // If we are moving a pair of loads / stores, see if it makes sense + // to try to allocate a pair of registers that can form register pairs. + MachineInstr *Op0 = Ops.back(); + MachineInstr *Op1 = Ops[Ops.size()-2]; + unsigned EvenReg = 0, OddReg = 0; + unsigned BaseReg = 0, OffReg = 0, PredReg = 0; + ARMCC::CondCodes Pred = ARMCC::AL; + bool isT2 = false; + unsigned NewOpc = 0; + int Offset = 0; + DebugLoc dl; + if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc, + EvenReg, OddReg, BaseReg, OffReg, + Offset, PredReg, Pred, isT2)) { + Ops.pop_back(); + Ops.pop_back(); + + // Form the pair instruction. + if (isLd) { + MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, + dl, TII->get(NewOpc)) + .addReg(EvenReg, RegState::Define) + .addReg(OddReg, RegState::Define) + .addReg(BaseReg); + if (!isT2) + MIB.addReg(OffReg); + MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + ++NumLDRDFormed; + } else { + MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, + dl, TII->get(NewOpc)) + .addReg(EvenReg) + .addReg(OddReg) + .addReg(BaseReg); + if (!isT2) + MIB.addReg(OffReg); + MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + ++NumSTRDFormed; + } + MBB->erase(Op0); + MBB->erase(Op1); + + // Add register allocation hints to form register pairs. + MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg); + MRI->setRegAllocationHint(OddReg, ARMRI::RegPairOdd, EvenReg); + } else { + for (unsigned i = 0; i != NumMove; ++i) { + MachineInstr *Op = Ops.back(); + Ops.pop_back(); + MBB->splice(InsertPos, MBB, Op); + } + } + + NumLdStMoved += NumMove; + RetVal = true; + } + } + } + + return RetVal; +} + +bool +ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { + bool RetVal = false; + + DenseMap<MachineInstr*, unsigned> MI2LocMap; + DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap; + DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap; + SmallVector<unsigned, 4> LdBases; + SmallVector<unsigned, 4> StBases; + + unsigned Loc = 0; + MachineBasicBlock::iterator MBBI = MBB->begin(); + MachineBasicBlock::iterator E = MBB->end(); + while (MBBI != E) { + for (; MBBI != E; ++MBBI) { + MachineInstr *MI = MBBI; + const TargetInstrDesc &TID = MI->getDesc(); + if (TID.isCall() || TID.isTerminator()) { + // Stop at barriers. 
+ ++MBBI; + break; + } + + MI2LocMap[MI] = Loc++; + if (!isMemoryOp(MI)) + continue; + unsigned PredReg = 0; + if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL) + continue; + + int Opc = MI->getOpcode(); + bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD; + unsigned Base = MI->getOperand(1).getReg(); + int Offset = getMemoryOpOffset(MI); + + bool StopHere = false; + if (isLd) { + DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI = + Base2LdsMap.find(Base); + if (BI != Base2LdsMap.end()) { + for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { + if (Offset == getMemoryOpOffset(BI->second[i])) { + StopHere = true; + break; + } + } + if (!StopHere) + BI->second.push_back(MI); + } else { + SmallVector<MachineInstr*, 4> MIs; + MIs.push_back(MI); + Base2LdsMap[Base] = MIs; + LdBases.push_back(Base); + } + } else { + DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI = + Base2StsMap.find(Base); + if (BI != Base2StsMap.end()) { + for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { + if (Offset == getMemoryOpOffset(BI->second[i])) { + StopHere = true; + break; + } + } + if (!StopHere) + BI->second.push_back(MI); + } else { + SmallVector<MachineInstr*, 4> MIs; + MIs.push_back(MI); + Base2StsMap[Base] = MIs; + StBases.push_back(Base); + } + } + + if (StopHere) { + // Found a duplicate (a base+offset combination that's seen earlier). + // Backtrack. + --Loc; + break; + } + } + + // Re-schedule loads. + for (unsigned i = 0, e = LdBases.size(); i != e; ++i) { + unsigned Base = LdBases[i]; + SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base]; + if (Lds.size() > 1) + RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap); + } + + // Re-schedule stores. + for (unsigned i = 0, e = StBases.size(); i != e; ++i) { + unsigned Base = StBases[i]; + SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base]; + if (Sts.size() > 1) + RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap); + } + + if (MBBI != E) { + Base2LdsMap.clear(); + Base2StsMap.clear(); + LdBases.clear(); + StBases.clear(); + } + } + + return RetVal; +} + + +/// createARMLoadStoreOptimizationPass - returns an instance of the load / store +/// optimization pass. +FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) { + if (PreAlloc) + return new ARMPreAllocLoadStoreOpt(); + return new ARMLoadStoreOpt(); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMMCAsmInfo.cpp new file mode 100644 index 0000000..53edfca --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMMCAsmInfo.cpp @@ -0,0 +1,68 @@ +//===-- ARMMCAsmInfo.cpp - ARM asm properties -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the ARMMCAsmInfo properties. 
+// +//===----------------------------------------------------------------------===// + +#include "ARMMCAsmInfo.h" +using namespace llvm; + +static const char *const arm_asm_table[] = { + "{r0}", "r0", + "{r1}", "r1", + "{r2}", "r2", + "{r3}", "r3", + "{r4}", "r4", + "{r5}", "r5", + "{r6}", "r6", + "{r7}", "r7", + "{r8}", "r8", + "{r9}", "r9", + "{r10}", "r10", + "{r11}", "r11", + "{r12}", "r12", + "{r13}", "r13", + "{r14}", "r14", + "{lr}", "lr", + "{sp}", "sp", + "{ip}", "ip", + "{fp}", "fp", + "{sl}", "sl", + "{memory}", "memory", + "{cc}", "cc", + 0,0 +}; + +ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() { + AsmTransCBE = arm_asm_table; + Data64bitsDirective = 0; + CommentString = "@"; + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::SjLj; +} + +ARMELFMCAsmInfo::ARMELFMCAsmInfo() { + // ".comm align is in bytes but .align is pow-2." + AlignmentIsInBytes = false; + + Data64bitsDirective = 0; + CommentString = "@"; + + HasLEB128 = true; + PrivateGlobalPrefix = ".L"; + WeakRefDirective = "\t.weak\t"; + HasLCOMMDirective = true; + + DwarfRequiresFrameSection = false; + + SupportsDebugInformation = true; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMMCAsmInfo.h b/contrib/llvm/lib/Target/ARM/ARMMCAsmInfo.h new file mode 100644 index 0000000..90f7822 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMMCAsmInfo.h @@ -0,0 +1,31 @@ +//=====-- ARMMCAsmInfo.h - ARM asm properties -------------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the ARMMCAsmInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ARMTARGETASMINFO_H +#define LLVM_ARMTARGETASMINFO_H + +#include "llvm/MC/MCAsmInfoDarwin.h" + +namespace llvm { + + struct ARMMCAsmInfoDarwin : public MCAsmInfoDarwin { + explicit ARMMCAsmInfoDarwin(); + }; + + struct ARMELFMCAsmInfo : public MCAsmInfo { + explicit ARMELFMCAsmInfo(); + }; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h new file mode 100644 index 0000000..0134276 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -0,0 +1,235 @@ +//====- ARMMachineFuctionInfo.h - ARM machine function info -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares ARM-specific per-machine-function information. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMMACHINEFUNCTIONINFO_H +#define ARMMACHINEFUNCTIONINFO_H + +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/ADT/BitVector.h" + +namespace llvm { + +/// ARMFunctionInfo - This class is derived from MachineFunction private +/// ARM target-specific information for each MachineFunction. +class ARMFunctionInfo : public MachineFunctionInfo { + + /// isThumb - True if this function is compiled under Thumb mode. + /// Used to initialized Align, so must precede it. 
+ bool isThumb; + + /// hasThumb2 - True if the target architecture supports Thumb2. Do not use + /// to determine if function is compiled under Thumb mode, for that use + /// 'isThumb'. + bool hasThumb2; + + /// VarArgsRegSaveSize - Size of the register save area for vararg functions. + /// + unsigned VarArgsRegSaveSize; + + /// HasStackFrame - True if this function has a stack frame. Set by + /// processFunctionBeforeCalleeSavedScan(). + bool HasStackFrame; + + /// LRSpilledForFarJump - True if the LR register has been for spilled to + /// enable far jump. + bool LRSpilledForFarJump; + + /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer + /// spill stack offset. + unsigned FramePtrSpillOffset; + + /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved + /// register spills areas. For Mac OS X: + /// + /// GPR callee-saved (1) : r4, r5, r6, r7, lr + /// -------------------------------------------- + /// GPR callee-saved (2) : r8, r10, r11 + /// -------------------------------------------- + /// DPR callee-saved : d8 - d15 + unsigned GPRCS1Offset; + unsigned GPRCS2Offset; + unsigned DPRCSOffset; + + /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills + /// areas. + unsigned GPRCS1Size; + unsigned GPRCS2Size; + unsigned DPRCSSize; + + /// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices + /// which belong to these spill areas. + BitVector GPRCS1Frames; + BitVector GPRCS2Frames; + BitVector DPRCSFrames; + + /// SpilledCSRegs - A BitVector mask of all spilled callee-saved registers. + /// + BitVector SpilledCSRegs; + + /// JumpTableUId - Unique id for jumptables. + /// + unsigned JumpTableUId; + + unsigned ConstPoolEntryUId; + + /// VarArgsFrameIndex - FrameIndex for start of varargs area. 
+ int VarArgsFrameIndex; + +public: + ARMFunctionInfo() : + isThumb(false), + hasThumb2(false), + VarArgsRegSaveSize(0), HasStackFrame(false), + LRSpilledForFarJump(false), + FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), + GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), + GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0), + JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0) {} + + explicit ARMFunctionInfo(MachineFunction &MF) : + isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()), + hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()), + VarArgsRegSaveSize(0), HasStackFrame(false), + LRSpilledForFarJump(false), + FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), + GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), + GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32), + SpilledCSRegs(MF.getTarget().getRegisterInfo()->getNumRegs()), + JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0) {} + + bool isThumbFunction() const { return isThumb; } + bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } + bool isThumb2Function() const { return isThumb && hasThumb2; } + + unsigned getVarArgsRegSaveSize() const { return VarArgsRegSaveSize; } + void setVarArgsRegSaveSize(unsigned s) { VarArgsRegSaveSize = s; } + + bool hasStackFrame() const { return HasStackFrame; } + void setHasStackFrame(bool s) { HasStackFrame = s; } + + bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; } + void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; } + + unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; } + void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; } + + unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; } + unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; } + unsigned getDPRCalleeSavedAreaOffset() const { return DPRCSOffset; } + + void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; } + void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; } + void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; } + + unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; } + unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; } + unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; } + + void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; } + void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; } + void setDPRCalleeSavedAreaSize(unsigned s) { DPRCSSize = s; } + + bool isGPRCalleeSavedArea1Frame(int fi) const { + if (fi < 0 || fi >= (int)GPRCS1Frames.size()) + return false; + return GPRCS1Frames[fi]; + } + bool isGPRCalleeSavedArea2Frame(int fi) const { + if (fi < 0 || fi >= (int)GPRCS2Frames.size()) + return false; + return GPRCS2Frames[fi]; + } + bool isDPRCalleeSavedAreaFrame(int fi) const { + if (fi < 0 || fi >= (int)DPRCSFrames.size()) + return false; + return DPRCSFrames[fi]; + } + + void addGPRCalleeSavedArea1Frame(int fi) { + if (fi >= 0) { + int Size = GPRCS1Frames.size(); + if (fi >= Size) { + Size *= 2; + if (fi >= Size) + Size = fi+1; + GPRCS1Frames.resize(Size); + } + GPRCS1Frames[fi] = true; + } + } + void addGPRCalleeSavedArea2Frame(int fi) { + if (fi >= 0) { + int Size = GPRCS2Frames.size(); + if (fi >= Size) { + Size *= 2; + if (fi >= Size) + Size = fi+1; + GPRCS2Frames.resize(Size); + } + GPRCS2Frames[fi] = true; + } + } + void addDPRCalleeSavedAreaFrame(int fi) { + if (fi >= 0) { + int Size = DPRCSFrames.size(); + if (fi >= 
Size) { + Size *= 2; + if (fi >= Size) + Size = fi+1; + DPRCSFrames.resize(Size); + } + DPRCSFrames[fi] = true; + } + } + + void setCSRegisterIsSpilled(unsigned Reg) { + SpilledCSRegs.set(Reg); + } + + bool isCSRegisterSpilled(unsigned Reg) const { + return SpilledCSRegs[Reg]; + } + + const BitVector &getSpilledCSRegisters() const { + return SpilledCSRegs; + } + + unsigned createJumpTableUId() { + return JumpTableUId++; + } + + unsigned getNumJumpTables() const { + return JumpTableUId; + } + + void initConstPoolEntryUId(unsigned UId) { + ConstPoolEntryUId = UId; + } + + unsigned getNumConstPoolEntries() const { + return ConstPoolEntryUId; + } + + unsigned createConstPoolEntryUId() { + return ConstPoolEntryUId++; + } + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } +}; +} // End llvm namespace + +#endif // ARMMACHINEFUNCTIONINFO_H diff --git a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h new file mode 100644 index 0000000..5ff7c38 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h @@ -0,0 +1,6586 @@ +//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file, which was autogenerated by llvm-PerfectShuffle, contains data +// for the optimal way to build a perfect shuffle using neon instructions. +// +//===----------------------------------------------------------------------===// + +// 31 entries have cost 0 +// 242 entries have cost 1 +// 1447 entries have cost 2 +// 3602 entries have cost 3 +// 1237 entries have cost 4 +// 2 entries have cost 5 + +// This table is 6561*4 = 26244 bytes in size. 
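Each 32-bit entry in the table below packs a cost together with a recipe for building the shuffle. The exact consumer lives in the lowering code rather than in this header, so the following is only a sketch: it assumes the packed layout conventionally used for LLVM's perfect-shuffle tables (cost in the top two bits, the operation to emit in the next four, then two 13-bit operand IDs), and the struct and helper names are illustrative, not part of the imported file.

    // Sketch only, assuming the conventional perfect-shuffle entry layout.
    struct PerfectShuffleEntry {
      unsigned Cost;   // 0..5; matches the "Cost N" annotations in the comments
      unsigned OpNum;  // which shuffle-building operation to emit (vext, vrev, vzip, ...)
      unsigned LHSID;  // 13-bit recipe ID for building the left operand
      unsigned RHSID;  // 13-bit recipe ID for building the right operand
    };

    static inline PerfectShuffleEntry decodePerfectShuffleEntry(unsigned PFEntry) {
      PerfectShuffleEntry E;
      E.Cost  =  PFEntry >> 30;
      E.OpNum = (PFEntry >> 26) & 0x0F;
      E.LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
      E.RHSID =  PFEntry        & ((1 << 13) - 1);
      return E;
    }

    // The table is indexed by the four mask elements of a 4-element shuffle,
    // each reduced to the range 0..8 (8 meaning undef, shown as 'u' in the
    // comments) and read as a base-9 number, which is why there are
    // 9^4 == 6561 entries:
    //   unsigned Index = M0*9*9*9 + M1*9*9 + M2*9 + M3;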
+static const unsigned PerfectShuffleTable[6561+1] = { + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS + 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> + 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> + 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> + 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> + 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS + 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> + 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> + 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> + 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> + 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> + 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS + 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> + 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> + 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS + 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> + 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> + 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> + 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> + 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS + 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> + 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> + 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> + 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> + 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> + 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> + 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> + 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> + 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> + 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> + 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> + 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS + 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> + 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> + 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> + 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> + 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> + 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> + 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> + 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> + 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> + 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> + 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS + 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> + 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> + 
3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS + 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> + 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> + 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> + 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> + 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> + 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> + 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> + 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> + 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> + 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> + 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> + 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS + 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS + 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> + 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS + 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> + 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS + 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> + 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> + 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> + 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> + 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> + 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> + 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> + 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS + 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> + 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> + 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> + 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> + 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS + 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> + 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> + 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> + 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS + 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS + 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> + 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS + 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> + 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> + 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> + 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> + 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> + 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> + 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS + 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> + 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> + 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> + 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> + 2568159334U, // <0,1,4,0>: Cost 3 vext1 
<3,0,1,4>, LHS + 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> + 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> + 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> + 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS + 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> + 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS + 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> + 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> + 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> + 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> + 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> + 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> + 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> + 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> + 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> + 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS + 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> + 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> + 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> + 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS + 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> + 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> + 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> + 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> + 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> + 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> + 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> + 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> + 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> + 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> + 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> + 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> + 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> + 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS + 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS + 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> + 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS + 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> + 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> + 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> + 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> + 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> + 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS + 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> + 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> + 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> + 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS + 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> + 
2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> + 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> + 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> + 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> + 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> + 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> + 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS + 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> + 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> + 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> + 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS + 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> + 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> + 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> + 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> + 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> + 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS + 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> + 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> + 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> + 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS + 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS + 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> + 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> + 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> + 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> + 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> + 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> + 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> + 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS + 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> + 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> + 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> + 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> + 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> + 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> + 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> + 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> + 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> + 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> + 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> + 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> + 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> + 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> + 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> + 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> + 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> + 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678563118U, // <0,2,u,2>: Cost 2 
vuzpl LHS, LHS + 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS + 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS + 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS + 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> + 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> + 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> + 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> + 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> + 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> + 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS + 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> + 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> + 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> + 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> + 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> + 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> + 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> + 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> + 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> + 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> + 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3> + 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2> + 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS + 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> + 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> + 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> + 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> + 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> + 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> + 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> + 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> + 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> + 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> + 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> + 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> + 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> + 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> + 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> + 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS + 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> + 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> + 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS + 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> + 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> + 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> + 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> + 2706106975U, // <0,3,5,7>: Cost 3 vext3 
<3,5,7,0>, <3,5,7,0> + 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> + 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> + 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> + 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> + 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> + 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> + 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> + 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> + 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> + 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> + 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> + 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> + 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> + 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> + 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> + 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> + 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> + 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> + 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS + 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> + 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> + 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS + 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3> + 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> + 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS + 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> + 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> + 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> + 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> + 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> + 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> + 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS + 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS + 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> + 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> + 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> + 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS + 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS + 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS + 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS + 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS + 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> + 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> + 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> + 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS + 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS + 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS + 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> + 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> + 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> + 3692349852U, // <0,4,3,3>: Cost 4 vext2 
<0,2,0,4>, <3,3,3,3> + 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> + 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> + 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS + 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> + 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> + 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> + 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> + 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> + 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> + 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> + 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS + 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS + 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> + 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS + 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> + 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> + 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> + 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> + 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> + 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> + 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> + 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> + 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> + 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> + 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS + 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> + 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> + 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> + 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS + 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> + 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> + 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS + 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS + 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> + 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS + 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS + 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS + 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> + 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS + 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> + 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS + 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> + 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> + 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS + 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> + 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> + 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS + 2625913501U, // 
<0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> + 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> + 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> + 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> + 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> + 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> + 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> + 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS + 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS + 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> + 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> + 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> + 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS + 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> + 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> + 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS + 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS + 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> + 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> + 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> + 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> + 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> + 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> + 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> + 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> + 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> + 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> + 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> + 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> + 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> + 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS + 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> + 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> + 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS + 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> + 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> + 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> + 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> + 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> + 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> + 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> + 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS + 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> + 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> + 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> + 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS + 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> + 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> + 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> + 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 3636250774U, // <0,5,7,3>: Cost 4 vext1 
<2,0,5,7>, <3,0,1,2> + 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> + 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS + 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> + 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> + 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> + 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS + 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> + 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> + 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> + 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS + 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> + 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> + 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS + 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS + 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS + 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> + 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> + 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> + 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS + 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> + 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> + 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS + 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS + 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> + 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> + 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> + 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> + 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> + 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> + 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS + 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS + 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> + 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> + 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> + 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> + 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> + 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> + 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> + 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> + 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> + 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS + 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> + 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> + 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> + 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS + 2619952681U, // <0,6,4,u>: 
Cost 3 vext2 <0,4,0,6>, RHS + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> + 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> + 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> + 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> + 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> + 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> + 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS + 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> + 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> + 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> + 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> + 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> + 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> + 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> + 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> + 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> + 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> + 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> + 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> + 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> + 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> + 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS + 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> + 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> + 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> + 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> + 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS + 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS + 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> + 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS + 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> + 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> + 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> + 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> + 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> + 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> + 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS + 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> + 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> + 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> + 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> + 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> + 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> + 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> + 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> + 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> + 
2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS + 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> + 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> + 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> + 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> + 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> + 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> + 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> + 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> + 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> + 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> + 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> + 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> + 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> + 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS + 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> + 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> + 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS + 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> + 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> + 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> + 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> + 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> + 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> + 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> + 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS + 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> + 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> + 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> + 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> + 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> + 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> + 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> + 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS + 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> + 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> + 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> + 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> + 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> + 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> + 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> + 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS + 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7> + 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> + 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> + 135053414U, // <0,u,0,0>: 
Cost 1 vdup0 LHS + 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS + 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2> + 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS + 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6> + 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> + 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS + 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> + 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS + 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS + 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS + 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7> + 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS + 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS + 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> + 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS + 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6> + 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS + 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> + 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> + 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> + 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> + 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS + 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> + 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS + 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS + 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6> + 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS + 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS + 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> + 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7> + 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7> + 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS + 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS + 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS + 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6> + 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> + 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7> + 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS + 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> + 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> + 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> + 1584714662U, // 
<0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> + 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS + 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> + 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> + 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> + 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS + 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6> + 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> + 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> + 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS + 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS + 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> + 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> + 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> + 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> + 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> + 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> + 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> + 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS + 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> + 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS + 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> + 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> + 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> + 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> + 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> + 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> + 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> + 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS + 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> + 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> + 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> + 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> + 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, 
<0,4,4,1> + 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS + 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> + 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> + 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS + 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> + 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS + 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> + 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> + 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> + 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> + 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS + 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS + 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> + 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> + 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> + 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> + 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> + 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> + 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> + 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> + 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> + 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> + 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> + 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> + 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> + 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> + 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> + 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> + 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> + 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> + 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS + 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0> + 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> + 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> + 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> + 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> + 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> + 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> + 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, 
<0,1,1,2> + 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> + 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> + 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> + 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS + 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> + 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> + 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> + 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> + 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> + 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> + 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> + 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS + 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> + 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> + 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> + 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> + 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> + 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> + 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> + 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS + 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> + 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> + 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> + 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> + 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> + 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> + 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> + 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> + 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> + 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> + 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> + 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> + 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> + 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> + 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> + 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> + 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> + 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> + 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> + 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS + 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> + 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> + 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> + 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> + 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3> + 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, 
RHS + 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7> + 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS + 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> + 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> + 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> + 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> + 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> + 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> + 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> + 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS + 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> + 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> + 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS + 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> + 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> + 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> + 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> + 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> + 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> + 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> + 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> + 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> + 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> + 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> + 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2> + 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> + 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> + 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> + 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS + 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> + 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> + 
2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> + 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> + 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> + 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> + 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> + 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> + 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> + 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> + 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> + 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> + 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> + 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS + 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> + 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> + 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> + 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> + 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> + 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> + 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS + 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> + 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> + 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> + 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS + 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS + 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> + 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> + 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> + 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS + 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> + 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> + 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> + 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> + 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> + 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3> + 3087844352U, // <1,3,3,7>: Cost 3 
vtrnr LHS, <1,3,5,7> + 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> + 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> + 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> + 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS + 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> + 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> + 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS + 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS + 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> + 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> + 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> + 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS + 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> + 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> + 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS + 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS + 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> + 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> + 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> + 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> + 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> + 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5> + 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> + 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> + 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> + 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> + 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> + 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> + 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> + 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> + 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> + 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS + 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> + 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> + 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS + 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS + 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> + 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS + 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS + 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> + 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> + 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> + 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> + 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> + 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> + 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> + 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> + 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> + 3693085726U, // 
<1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> + 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS + 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> + 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS + 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> + 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> + 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> + 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> + 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> + 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> + 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> + 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS + 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS + 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> + 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> + 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> + 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> + 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> + 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> + 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> + 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> + 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> + 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> + 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> + 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> + 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> + 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS + 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> + 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> + 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS + 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS + 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> + 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1> + 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> + 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS + 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS + 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS + 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> + 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> + 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> + 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS + 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> + 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> + 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS + 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> + 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> + 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> + 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> + 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS + 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> + 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> + 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> + 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, 
<7,u,1,4> + 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS + 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS + 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> + 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> + 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6> + 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> + 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> + 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> + 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> + 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> + 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> + 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> + 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS + 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> + 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> + 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> + 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> + 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> + 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> + 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> + 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS + 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> + 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> + 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> + 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> + 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> + 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> + 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> + 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> + 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS + 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> + 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> + 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> + 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> + 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> + 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> + 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> + 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> + 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS + 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS + 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> + 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> + 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> + 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> + 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS + 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> + 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS + 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS + 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> + 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> + 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, 
<1,2,5,3> + 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS + 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> + 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0> + 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> + 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS + 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> + 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> + 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3> + 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> + 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> + 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> + 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6> + 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> + 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> + 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS + 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> + 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> + 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> + 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS + 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> + 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> + 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> + 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS + 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2> + 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3> + 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1> + 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5> + 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7> + 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS + 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS + 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> + 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS + 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> + 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> + 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> + 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> + 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> + 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS + 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS + 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2> + 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> + 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> + 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> + 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> + 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> + 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> + 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS + 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> + 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> + 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> + 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> + 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS + 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> + 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> + 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 
2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS + 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS + 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> + 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> + 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> + 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS + 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5> + 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> + 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> + 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> + 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> + 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> + 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> + 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> + 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS + 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS + 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS + 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS + 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> + 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> + 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> + 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> + 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> + 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> + 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> + 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS + 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS + 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> + 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> + 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> + 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> + 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS + 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> + 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> + 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> + 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> + 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> + 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> + 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> + 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> + 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> + 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> + 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> + 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS + 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> + 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> + 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS + 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> + 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1> + 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> + 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7> + 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> + 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> + 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS + 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> + 3707044102U, // 
<1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> + 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> + 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> + 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> + 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> + 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS + 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS + 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> + 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> + 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> + 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS + 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> + 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> + 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> + 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> + 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS + 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> + 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> + 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> + 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS + 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> + 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> + 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> + 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS + 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS + 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> + 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> + 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> + 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS + 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> + 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> + 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> + 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS + 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> + 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> + 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> + 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> + 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> + 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS + 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> + 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> + 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS + 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS + 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> + 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> + 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> + 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS + 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> + 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> + 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS + 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS + 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> + 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> + 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> + 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> + 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS + 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> + 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> + 2724845076U, // <1,7,6,7>: Cost 3 vext3 
<6,7,0,1>, <7,6,7,0> + 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> + 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> + 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> + 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> + 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> + 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS + 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> + 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> + 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> + 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> + 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS + 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS + 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> + 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> + 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS + 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> + 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> + 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> + 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS + 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> + 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS + 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> + 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> + 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> + 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1> + 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2> + 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> + 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS + 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS + 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS + 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> + 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS + 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS + 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS + 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 115726126U, // <1,u,3,2>: Cost 1 vrev LHS + 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS + 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS + 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> + 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS + 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS + 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> + 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> + 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4> + 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> + 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS + 1546300726U, // <1,u,4,5>: Cost 2 vext2 
<0,4,1,u>, RHS + 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> + 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6> + 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS + 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS + 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> + 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS + 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS + 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS + 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> + 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0> + 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> + 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7> + 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> + 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7> + 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> + 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> + 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> + 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> + 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> + 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS + 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> + 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7> + 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> + 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> + 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS + 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS + 115767091U, // <1,u,u,2>: Cost 1 vrev LHS + 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS + 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS + 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS + 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS + 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS + 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> + 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> + 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> + 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> + 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS + 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> + 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0> + 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> + 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> + 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> + 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> + 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS + 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS + 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> + 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> + 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> + 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS + 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS + 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> + 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> + 
2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> + 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS + 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> + 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> + 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> + 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS + 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> + 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> + 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> + 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS + 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> + 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> + 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> + 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> + 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS + 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> + 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> + 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> + 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS + 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> + 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> + 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS + 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> + 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> + 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> + 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> + 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> + 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> + 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> + 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> + 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS + 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> + 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> + 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> + 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> + 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> + 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> + 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> + 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> + 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> + 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> + 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> + 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> + 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> + 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> + 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS + 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> + 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> + 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS + 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 1611956947U, // <2,0,u,u>: Cost 2 vext3 
<0,2,0,2>, LHS + 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS + 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS + 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> + 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> + 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS + 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> + 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> + 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> + 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> + 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS + 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> + 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> + 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> + 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS + 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> + 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> + 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> + 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> + 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS + 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> + 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> + 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> + 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS + 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> + 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> + 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> + 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> + 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS + 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> + 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> + 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS + 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> + 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> + 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> + 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> + 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> + 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> + 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS + 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS + 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS + 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> + 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4> + 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS + 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> + 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> + 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> + 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS + 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5> + 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> + 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS + 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> + 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS + 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> + 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> + 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS + 2551590198U, // <2,1,6,4>: Cost 3 vext1 
<0,2,1,6>, RHS + 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> + 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> + 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> + 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS + 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> + 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> + 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> + 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> + 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6> + 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> + 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> + 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> + 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2> + 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS + 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> + 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> + 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS + 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1> + 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> + 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> + 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS + 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> + 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> + 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS + 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> + 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> + 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> + 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> + 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> + 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> + 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0> + 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS + 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS + 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> + 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> + 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> + 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS + 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS + 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> + 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS + 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> + 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS + 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7> + 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> + 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS + 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> + 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> + 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> + 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS + 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> + 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> + 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> + 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS + 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS + 3625386742U, // 
<2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2> + 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> + 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> + 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS + 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS + 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> + 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS + 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> + 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> + 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> + 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS + 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> + 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> + 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> + 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS + 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> + 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> + 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> + 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> + 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS + 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> + 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> + 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> + 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> + 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> + 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> + 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> + 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS + 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> + 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> + 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> + 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> + 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> + 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS + 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS + 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS + 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS + 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS + 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS + 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS + 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> + 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> + 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS + 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS + 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> + 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, 
<1,7,3,0> + 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> + 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> + 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> + 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> + 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> + 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> + 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> + 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> + 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> + 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> + 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS + 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> + 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> + 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS + 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS + 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> + 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS + 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS + 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> + 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> + 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> + 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> + 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> + 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> + 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> + 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3> + 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> + 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> + 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS + 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1> + 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS + 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, 
<u,6,3,7> + 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1> + 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS + 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> + 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4> + 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> + 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> + 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> + 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> + 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> + 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> + 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> + 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS + 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> + 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> + 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> + 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> + 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS + 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> + 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> + 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> + 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> + 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> + 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS + 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS + 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> + 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS + 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> + 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> + 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> + 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> + 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> + 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> + 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> + 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> + 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> + 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> + 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> + 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS + 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS + 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> + 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS + 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> + 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> + 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> + 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> + 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS + 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> + 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS + 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> + 2551809516U, // 
<2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> + 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> + 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS + 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> + 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> + 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS + 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> + 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> + 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> + 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> + 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> + 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> + 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> + 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS + 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> + 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> + 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS + 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> + 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS + 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> + 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> + 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> + 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> + 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> + 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> + 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS + 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> + 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> + 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> + 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> + 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> + 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> + 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> + 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> + 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> + 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> + 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> + 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> + 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> + 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> + 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> + 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> + 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS + 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS + 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> + 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> + 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> + 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> + 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> + 2953628162U, 
// <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> + 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS + 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4> + 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> + 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> + 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS + 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS + 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> + 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS + 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS + 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> + 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> + 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> + 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS + 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> + 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> + 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> + 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> + 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS + 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> + 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> + 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> + 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> + 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> + 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> + 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS + 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS + 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS + 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> + 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> + 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> + 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS + 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> + 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> + 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS + 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS + 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS + 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> + 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> + 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6> + 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS + 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS + 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> + 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> + 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> + 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> + 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> + 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> + 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> + 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS + 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> + 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> + 
2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> + 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> + 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> + 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7> + 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> + 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS + 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> + 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> + 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> + 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> + 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6> + 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> + 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> + 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS + 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> + 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> + 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> + 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> + 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> + 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> + 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> + 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS + 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS + 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> + 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> + 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> + 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> + 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> + 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6> + 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS + 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS + 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> + 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3> + 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> + 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> + 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> + 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5> + 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0> + 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS + 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS + 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS + 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1> + 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> + 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3> + 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS + 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> + 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> + 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS + 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS + 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> + 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> + 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> + 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> + 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> + 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> + 2725294981U, 
// <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> + 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0> + 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1> + 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2> + 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3> + 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6> + 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7> + 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS + 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS + 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2> + 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> + 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> + 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> + 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2> + 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7> + 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> + 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2> + 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> + 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1> + 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0> + 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7> + 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5> + 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7> + 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7> + 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0> + 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2> + 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7> + 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2> + 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1> + 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6> + 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> + 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> + 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1> + 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7> + 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS + 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> + 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> + 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3> + 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS + 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> + 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> + 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2> + 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS + 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6> + 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4> + 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7> + 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> + 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4> + 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS + 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u> + 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0> + 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS + 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> + 3643679744U, // 
<2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7> + 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7> + 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5> + 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS + 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5> + 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7> + 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6> + 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS + 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS + 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7> + 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3> + 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6> + 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS + 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> + 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> + 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0> + 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS + 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> + 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2> + 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7> + 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7> + 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS + 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5> + 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> + 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7> + 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1> + 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS + 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> + 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2> + 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u> + 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS + 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS + 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> + 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2> + 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS + 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS + 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2> + 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2> + 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS + 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3> + 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> + 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS + 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS + 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS + 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 
3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS + 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> + 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> + 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS + 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> + 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> + 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS + 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS + 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS + 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> + 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> + 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4> + 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS + 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS + 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS + 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> + 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> + 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7> + 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS + 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> + 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7> + 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS + 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS + 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> + 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> + 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> + 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS + 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS + 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS + 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS + 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS + 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS + 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS + 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS + 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> + 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> + 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> + 3763576887U, // <3,0,0,6>: Cost 4 
vext3 LHS, <0,0,6,1> + 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> + 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> + 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS + 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> + 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS + 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> + 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS + 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> + 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> + 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> + 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> + 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> + 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> + 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> + 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> + 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> + 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> + 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> + 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> + 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> + 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> + 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> + 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> + 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> + 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> + 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6> + 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> + 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> + 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3> + 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS + 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> + 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6> + 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5> + 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> + 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7> + 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS + 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> + 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> + 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> + 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> + 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1> + 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> + 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6> + 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> + 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> + 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2> + 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> + 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> + 3643844997U, // <3,0,7,3>: Cost 4 
vext1 <3,3,0,7>, <3,3,0,7> + 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6> + 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> + 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> + 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7> + 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> + 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> + 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS + 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7> + 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> + 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS + 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS + 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS + 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> + 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> + 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> + 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> + 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> + 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> + 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS + 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> + 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> + 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS + 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> + 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> + 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> + 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS + 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS + 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> + 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // 
<3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> + 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> + 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> + 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> + 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> + 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> + 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS + 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> + 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> + 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS + 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> + 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> + 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> + 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> + 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> + 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> + 1611892383U, // <3,2,2,u>: Cost 2 vext3 
LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> + 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> + 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> + 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> + 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5> + 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7> + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1> + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2> + 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> + 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> + 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS + 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6> + 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> + 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7> + 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> + 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1> + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> + 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> + 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> + 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> + 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> + 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> + 4034053264U, // <3,3,0,7>: 
Cost 4 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> + 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> + 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> + 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> + 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> + 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS + 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> + 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> + 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> + 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> + 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> + 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> + 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> + 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> + 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> + 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> + 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS + 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> + 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0> + 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> + 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> + 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> + 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> + 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> + 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> + 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> + 2558381366U, // <3,3,7,4>: 
Cost 3 vext1 <1,3,3,7>, RHS + 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> + 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> + 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> + 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS + 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS + 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> + 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3> + 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS + 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> + 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> + 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS + 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> + 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS + 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> + 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> + 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> + 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> + 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> + 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> + 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS + 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> + 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> + 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> + 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> + 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS + 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> + 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> + 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> + 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS + 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> + 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> + 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> + 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> + 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> + 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> + 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> + 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> + 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> + 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> + 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> + 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> + 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> + 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS + 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS + 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> + 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> + 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS + 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> + 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> + 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS + 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> + 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> + 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS + 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS + 
2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> + 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> + 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS + 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> + 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS + 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> + 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS + 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> + 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> + 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3> + 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> + 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> + 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> + 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> + 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2> + 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> + 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> + 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> + 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6> + 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> + 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> + 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7> + 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> + 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS + 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS + 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> + 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1> + 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS + 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS + 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS + 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> + 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS + 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> + 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS + 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> + 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> + 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> + 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> + 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> + 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS + 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS + 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> + 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> + 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> + 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> + 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> + 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> + 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> + 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> + 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> + 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> + 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> + 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> + 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> + 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> + 3692578746U, // 
<3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> + 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> + 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> + 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> + 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> + 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> + 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> + 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> + 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> + 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> + 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS + 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS + 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS + 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> + 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> + 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> + 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS + 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS + 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> + 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> + 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS + 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> + 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> + 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> + 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS + 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> + 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> + 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> + 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> + 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> + 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> + 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> + 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> + 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> + 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> + 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> + 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> + 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS + 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> + 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> + 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> + 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS + 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> + 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> + 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS + 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS + 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> + 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> + 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> + 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS + 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> + 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3> + 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS + 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS + 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> + 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> + 
3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> + 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> + 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> + 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> + 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> + 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> + 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> + 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> + 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> + 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> + 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> + 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> + 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> + 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> + 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS + 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> + 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> + 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> + 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> + 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> + 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> + 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> + 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> + 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> + 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> + 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> + 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> + 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> + 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> + 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> + 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS + 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> + 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> + 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> + 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> + 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> + 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> + 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> + 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS + 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> + 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS + 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> + 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> + 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> + 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> + 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> + 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> + 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS + 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> + 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> + 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> + 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6> + 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> + 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> + 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> + 
1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> + 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> + 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> + 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> + 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> + 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> + 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> + 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> + 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS + 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> + 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> + 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> + 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> + 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> + 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> + 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> + 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> + 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> + 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> + 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> + 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> + 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> + 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> + 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> + 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> + 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> + 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> + 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> + 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> + 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS + 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> + 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> + 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> + 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> + 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS + 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> + 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> + 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> + 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS + 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> + 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> + 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> + 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> + 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> + 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> + 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> + 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> + 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> + 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> + 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> + 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> + 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> + 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS + 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> + 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> + 
3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> + 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS + 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS + 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> + 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> + 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS + 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> + 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> + 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> + 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> + 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> + 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS + 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> + 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> + 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS + 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> + 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> + 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> + 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS + 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2> + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0> + 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS + 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6> + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7> + 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2> + 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2> + 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1> + 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1> + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2> + 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2> + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3> + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> + 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3> + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // 
<3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0> + 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7> + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3> + 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0> + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1> + 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> + 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2> + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5> + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7> + 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS + 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5> + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5> + 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6> + 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6> + 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6> + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6> + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7> + 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7> + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1> + 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> + 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7> + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7> + 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0> + 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7> + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> + 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2> + 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1> + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2> + 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS + 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5> + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6> + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0> + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, 
LHS + 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> + 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> + 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> + 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> + 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> + 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> + 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> + 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> + 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> + 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> + 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> + 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> + 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> + 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> + 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> + 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS + 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> + 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> + 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS + 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> + 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> + 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> + 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS + 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> + 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> + 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> + 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS + 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS + 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> + 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> + 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS + 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS + 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS + 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> + 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> + 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS + 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> + 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> + 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> + 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS + 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS + 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> + 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS + 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> + 2552843574U, // <4,0,6,4>: Cost 3 vext1 
<0,4,0,6>, RHS + 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> + 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> + 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> + 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS + 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> + 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS + 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS + 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> + 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> + 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> + 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> + 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> + 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS + 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS + 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS + 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> + 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS + 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS + 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> + 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> + 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS + 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> + 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> + 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> + 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> + 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> + 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS + 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> + 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> + 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> + 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> + 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> + 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> + 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> + 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS + 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> + 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> + 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> + 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS + 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> + 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> + 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> + 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> + 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> + 
3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> + 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> + 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> + 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS + 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> + 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS + 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> + 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> + 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS + 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> + 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> + 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> + 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> + 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3> + 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS + 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> + 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> + 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS + 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS + 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> + 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> + 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> + 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS + 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> + 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> + 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> + 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> + 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> + 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> + 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> + 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS + 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> + 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> + 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS + 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS + 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> + 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS + 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> + 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> + 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS + 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> + 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> + 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> + 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 
vext3 <1,2,3,4>, <2,1,4,3> + 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> + 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> + 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> + 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> + 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> + 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> + 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> + 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> + 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> + 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> + 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> + 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> + 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> + 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> + 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> + 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> + 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> + 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> + 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> + 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> + 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> + 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> + 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS + 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> + 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> + 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS + 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS + 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> + 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> + 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS + 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS + 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> + 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> + 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS + 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS + 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS + 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> + 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> + 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> + 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3> + 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2> + 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> + 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> + 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> + 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> + 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> + 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> + 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> + 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> 
+ 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS + 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> + 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> + 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS + 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> + 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2> + 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS + 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> + 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> + 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> + 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> + 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> + 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> + 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> + 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> + 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> + 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> + 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> + 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> + 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> + 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> + 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS + 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> + 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> + 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> + 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> + 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> + 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> + 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> + 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> + 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> + 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> + 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> + 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> + 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> + 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> + 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> + 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> + 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> + 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> + 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS + 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> + 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> + 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> + 2559028534U, // 
<4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS + 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> + 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> + 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> + 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS + 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS + 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> + 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> + 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> + 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> + 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> + 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> + 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> + 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> + 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> + 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> + 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> + 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> + 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> + 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> + 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> + 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS + 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> + 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> + 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> + 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> + 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> + 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> + 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> + 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> + 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> + 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> + 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> + 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> + 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS + 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> + 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> + 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> + 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> + 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> + 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> + 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> + 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> + 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> + 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> + 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> + 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> + 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> + 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> + 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> + 2634827874U, // <4,4,2,u>: Cost 3 vext2 
<2,u,4,4>, <2,u,4,4> + 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> + 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> + 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> + 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> + 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> + 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> + 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> + 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> + 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> + 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS + 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS + 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS + 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> + 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> + 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> + 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS + 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> + 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS + 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> + 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> + 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> + 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS + 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> + 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS + 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> + 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS + 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> + 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> + 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> + 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> + 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> + 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> + 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> + 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS + 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS + 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> + 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS + 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> + 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> + 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> + 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> + 3705258414U, // <4,5,0,5>: 
Cost 4 vext2 <2,3,4,5>, <0,5,2,7> + 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> + 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> + 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS + 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> + 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> + 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> + 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS + 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> + 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> + 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> + 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> + 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS + 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS + 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> + 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> + 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> + 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> + 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> + 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> + 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS + 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> + 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> + 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> + 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> + 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> + 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> + 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> + 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> + 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> + 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> + 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS + 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> + 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> + 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> + 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS + 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> + 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> + 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> + 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> + 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> + 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> + 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> + 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> + 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS + 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS + 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS + 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> + 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> + 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> + 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS + 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> + 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS + 2565161882U, 
// <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> + 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> + 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> + 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS + 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> + 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> + 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> + 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS + 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS + 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0> + 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> + 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS + 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7> + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> + 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> + 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> + 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> + 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> + 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS + 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> + 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> + 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> + 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> + 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> + 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> + 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS + 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> + 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> + 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> + 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> + 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> + 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> + 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> + 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> + 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> + 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> + 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> + 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> + 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> + 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> + 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> + 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS + 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> + 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> + 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> + 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS + 1545178422U, // <4,6,4,5>: Cost 2 vext2 
<0,2,4,6>, RHS + 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS + 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> + 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS + 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS + 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> + 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> + 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> + 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> + 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> + 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> + 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS + 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS + 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS + 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> + 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> + 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> + 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> + 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> + 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> + 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS + 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS + 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> + 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> + 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> + 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> + 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> + 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> + 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> + 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS + 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1> + 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS + 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS + 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS + 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> + 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> + 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> + 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> + 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> + 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> + 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> + 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> + 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS + 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> + 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> + 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> + 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> + 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS + 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> + 3706603112U, // <4,7,2,2>: Cost 4 vext2 
<2,5,4,7>, <2,2,2,2> + 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> + 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> + 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> + 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> + 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> + 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> + 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> + 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> + 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> + 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> + 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> + 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> + 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> + 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> + 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> + 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> + 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS + 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> + 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> + 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS + 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> + 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> + 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> + 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> + 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> + 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> + 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> + 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> + 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> + 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> + 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> + 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> + 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS + 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS + 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> + 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> + 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> + 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> + 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> + 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> + 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> + 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> + 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS + 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> + 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> + 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS + 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> + 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, 
<6,2,7,3> + 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS + 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> + 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> + 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2> + 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> + 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0> + 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> + 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> + 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS + 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> + 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> + 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3> + 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> + 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> + 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3> + 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS + 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> + 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> + 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> + 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> + 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4> + 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> + 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3> + 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> + 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> + 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> + 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> + 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> + 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> + 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7> + 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> + 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> + 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS + 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2> + 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> + 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS + 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6> + 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS + 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS + 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> + 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS + 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS + 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> + 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS + 1497606685U, // 
<4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> + 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS + 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> + 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS + 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> + 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> + 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> + 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS + 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> + 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS + 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS + 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> + 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> + 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> + 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> + 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS + 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> + 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> + 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> + 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> + 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> + 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> + 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> + 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> + 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> + 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> + 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> + 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> + 
2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> + 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> + 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS + 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS + 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> + 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> + 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> + 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> + 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS + 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS + 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> + 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> + 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> + 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> + 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> + 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> + 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> + 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS + 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS + 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> + 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> + 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> + 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS + 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> + 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> + 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> + 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS + 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> + 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> + 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> + 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> + 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> + 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> + 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> + 2691326803U, // <5,1,1,4>: Cost 
3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> + 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> + 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> + 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> + 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> + 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> + 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> + 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> + 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> + 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> + 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0> + 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> + 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> + 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> + 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> + 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> + 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> + 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> + 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> + 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> + 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS + 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> + 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> + 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> + 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS + 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> + 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> + 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS + 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> + 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> + 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS + 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS + 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> + 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> + 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> + 
2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS + 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1> + 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> + 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS + 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7> + 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1> + 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS + 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> + 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> + 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> + 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> + 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> + 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> + 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS + 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> + 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> + 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> + 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> + 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS + 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> + 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> + 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> + 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> + 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS + 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> + 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> + 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> + 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> + 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> + 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> + 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> + 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> + 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> + 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> + 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> + 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS + 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS + 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> + 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> + 2982396006U, // <5,2,5,3>: 
Cost 3 vzipr <4,u,5,5>, LHS + 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS + 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> + 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> + 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> + 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS + 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS + 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> + 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> + 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> + 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> + 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> + 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> + 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS + 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> + 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> + 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS + 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS + 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> + 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> + 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> + 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS + 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> + 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> + 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> + 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> + 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> + 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> + 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> + 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> + 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> + 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> + 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> + 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> + 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> + 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> + 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> + 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> + 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> + 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> + 3769788783U, // <5,3,2,7>: 
Cost 4 vext3 <1,u,5,5>, <3,2,7,3> + 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> + 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> + 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> + 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> + 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> + 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> + 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> + 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> + 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> + 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> + 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> + 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> + 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> + 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> + 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3> + 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> + 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> + 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3> + 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS + 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS + 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, 
<0,2,0,2> + 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> + 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> + 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS + 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> + 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> + 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> + 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> + 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> + 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> + 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> + 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> + 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> + 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> + 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> + 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> + 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> + 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> + 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> + 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> + 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> + 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5> + 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS + 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> + 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> + 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> + 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> + 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> + 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> + 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> + 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> + 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> + 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> + 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> + 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> + 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS + 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> + 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> + 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> + 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> + 2712784270U, // 
<5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> + 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS + 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS + 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> + 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> + 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> + 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS + 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> + 94817590U, // <5,4,7,6>: Cost 1 vrev RHS + 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS + 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> + 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> + 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4> + 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS + 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> + 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> + 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> + 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> + 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> + 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> + 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> + 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS + 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> + 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> + 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> + 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> + 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> + 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> + 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> + 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> + 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> + 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> + 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> + 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> + 3645516369U, // 
<5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> + 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> + 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> + 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> + 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> + 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS + 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> + 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> + 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> + 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> + 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> + 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> + 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> + 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> + 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS + 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> + 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> + 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> + 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS + 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> + 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> + 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS + 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> + 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> + 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7> + 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS + 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS + 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> + 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> + 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> + 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> + 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> + 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 2628945300U, // <5,6,1,u>: Cost 3 vext2 
<1,u,5,6>, <1,u,5,6> + 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> + 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> + 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3> + 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> + 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> + 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> + 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> + 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> + 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS + 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> + 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> + 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> + 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> + 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> + 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> + 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> + 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS + 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS + 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS + 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> + 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> + 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> + 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS + 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> + 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> + 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 
1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> + 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> + 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> + 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> + 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> + 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> + 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> + 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> + 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> + 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS + 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> + 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> + 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> + 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> + 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> + 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> + 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> + 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> + 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> + 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> + 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> + 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> + 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> + 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> + 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> + 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> + 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> + 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS + 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> + 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> + 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> + 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS + 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> + 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> + 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS + 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS + 2846281826U, // <5,7,6,0>: Cost 3 
vuzpr RHS, <5,6,7,0> + 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> + 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> + 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> + 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> + 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> + 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> + 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> + 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> + 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> + 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> + 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> + 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> + 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS + 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0> + 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS + 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7> + 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS + 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> + 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2> + 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1> + 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1> + 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0> + 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> + 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> + 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> + 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0> + 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> + 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> + 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> + 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3> + 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> + 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> + 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> + 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3> + 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> + 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1> + 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> + 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> + 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 2625645160U, // <5,u,3,5>: Cost 3 vext2 
<1,3,5,u>, <3,5,u,0> + 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5> + 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> + 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5> + 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6> + 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5> + 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> + 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3> + 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7> + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS + 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS + 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS + 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> + 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7> + 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS + 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS + 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7> + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS + 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS + 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS + 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> + 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> + 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> + 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> + 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> + 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> + 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> + 2566073654U, // <6,0,1,4>: 
Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> + 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> + 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> + 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> + 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> + 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> + 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> + 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> + 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> + 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> + 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS + 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> + 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> + 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> + 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> + 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> + 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> + 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> + 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS + 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS + 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> + 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> + 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> + 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> + 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> + 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS + 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS + 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> + 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> + 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> + 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> + 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> + 
2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS + 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> + 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> + 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS + 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> + 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> + 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS + 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> + 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> + 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> + 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> + 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> + 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> + 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> + 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> + 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> + 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS + 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> + 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> + 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> + 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS + 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> + 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> + 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> + 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> + 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> + 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> + 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> + 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS + 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> + 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> + 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> + 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> + 3767583878U, // <6,1,5,2>: Cost 4 
vext3 <1,5,2,6>, <1,5,2,6> + 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS + 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> + 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS + 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> + 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> + 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS + 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS + 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> + 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> + 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> + 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> + 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> + 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> + 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> + 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> + 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS + 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS + 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> + 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> + 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> + 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> + 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> + 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> + 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> + 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> + 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> + 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS + 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> + 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> + 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> + 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> + 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> + 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> + 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, 
<2,2,6,6> + 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> + 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> + 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> + 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> + 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> + 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> + 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> + 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> + 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> + 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> + 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> + 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> + 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> + 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> + 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> + 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> + 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> + 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> + 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS + 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> + 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> + 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS + 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS + 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> + 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> + 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS + 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2> + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> + 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS + 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> + 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS + 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> + 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> + 
2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> + 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> + 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> + 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> + 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> + 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> + 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> + 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> + 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> + 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> + 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> + 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> + 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> + 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> + 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> + 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> + 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> + 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> + 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> + 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> + 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> + 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> + 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> + 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS + 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> + 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> + 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> + 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> + 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> + 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> + 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> + 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS + 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> + 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> + 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> + 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6> + 
3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> + 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> + 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> + 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> + 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> + 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> + 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> + 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> + 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS + 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> + 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> + 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS + 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS + 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> + 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> + 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> + 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> + 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> + 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> + 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> + 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS + 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> + 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> + 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> + 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> + 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> + 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> + 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> + 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> + 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> + 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> + 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6> + 2590277734U, // <6,4,4,0>: Cost 3 vext1 
<6,6,4,4>, LHS + 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> + 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> + 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> + 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> + 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS + 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> + 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> + 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> + 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> + 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS + 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> + 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> + 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> + 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS + 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> + 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> + 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> + 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> + 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS + 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> + 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> + 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> + 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> + 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> + 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> + 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> + 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> + 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS + 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> + 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, 
<1,5,3,7> + 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> + 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> + 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> + 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> + 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> + 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS + 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> + 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS + 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> + 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2> + 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> + 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> + 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> + 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> + 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> + 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> + 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS + 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> + 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS + 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS + 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> + 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> + 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> + 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS + 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> + 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> + 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> + 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> + 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> + 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> + 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> + 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> + 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> + 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> + 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> + 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> + 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> + 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> + 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS + 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS + 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS + 
2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> + 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> + 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7> + 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS + 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS + 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> + 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> + 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> + 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> + 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> + 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> + 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> + 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> + 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS + 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> + 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> + 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> + 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> + 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> + 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> + 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> + 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> + 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> + 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> + 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> + 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> + 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> + 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> + 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS + 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> + 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS + 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> + 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> + 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> + 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> + 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> + 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> + 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> + 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 2646937604U, 
// <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> + 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> + 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS + 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> + 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> + 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS + 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> + 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> + 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> + 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS + 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> + 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> + 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS + 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS + 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> + 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS + 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> + 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> + 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> + 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> + 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> + 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // 
<6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> + 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> + 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> + 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> + 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> + 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> + 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1> + 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2> + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0> + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> 
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3> + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS + 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3> + 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7> + 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5> + 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6> + 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6> + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6> + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7> + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS + 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7> + 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS + 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> + 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS + 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS + 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS + 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> + 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> + 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS + 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS + 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> + 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> + 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS + 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS + 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS + 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS + 1612290405U, // <6,u,u,2>: Cost 2 vext3 
<0,2,4,6>, LHS + 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS + 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS + 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS + 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS + 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS + 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> + 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> + 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> + 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> + 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> + 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> + 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> + 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> + 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS + 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> + 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS + 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> + 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS + 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> + 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> + 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> + 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> + 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> + 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> + 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> + 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> + 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> + 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> + 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> + 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> + 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> + 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> + 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> + 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> + 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> + 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> + 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> + 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> + 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> + 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> + 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> + 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS + 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> + 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> + 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> + 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS + 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> + 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> + 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> + 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> + 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> + 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> + 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> + 
2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> + 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> + 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> + 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> + 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> + 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> + 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> + 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> + 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> + 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS + 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS + 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> + 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> + 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> + 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> + 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> + 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> + 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> + 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> + 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS + 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> + 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> + 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS + 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> + 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1> + 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS + 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS + 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS + 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS + 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> + 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS + 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> + 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> + 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> + 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> + 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> + 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> + 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> + 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> + 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> + 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> + 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> + 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> + 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> + 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> + 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> + 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> + 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> + 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> + 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> + 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> + 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> + 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> + 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> + 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> + 1619002368U, // <7,1,3,5>: Cost 2 vext3 
<1,3,5,7>, <1,3,5,7> + 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> + 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> + 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> + 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> + 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> + 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> + 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> + 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> + 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS + 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> + 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> + 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS + 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS + 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> + 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> + 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> + 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS + 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> + 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> + 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> + 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> + 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> + 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> + 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> + 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> + 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> + 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> + 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> + 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> + 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> + 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> + 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> + 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS + 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS + 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> + 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> + 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> + 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS + 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS + 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> + 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> + 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> + 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS + 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> + 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> + 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS + 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> + 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> + 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> + 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> + 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> + 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> + 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> + 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> + 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> + 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> + 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> + 3785803276U, // <7,2,1,1>: Cost 4 vext3 
RHS, <2,1,1,0> + 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> + 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> + 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> + 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> + 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> + 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> + 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> + 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> + 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> + 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> + 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> + 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> + 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> + 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> + 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> + 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> + 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> + 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> + 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> + 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> + 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> + 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> + 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> + 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> + 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> + 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> + 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> + 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> + 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> + 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> + 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> + 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> + 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> + 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> + 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> + 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> + 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> + 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> + 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> + 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> + 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> + 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS + 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> + 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> + 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> + 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS + 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> + 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> + 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> + 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> + 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> + 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> + 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> + 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS + 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS + 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> + 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> + 
2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> + 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS + 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> + 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> + 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> + 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> + 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> + 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> + 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> + 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> + 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> + 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> + 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> + 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> + 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> + 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> + 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> + 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> + 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> + 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> + 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> + 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> + 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> + 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> + 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> + 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> + 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> + 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> + 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> + 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> + 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> + 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> + 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> + 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> + 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> + 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> + 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> + 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> + 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> + 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> + 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> + 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> + 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> + 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> + 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> + 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> + 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> + 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> + 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> + 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> + 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> + 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> + 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> + 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS + 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> + 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> + 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> + 
2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> + 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> + 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> + 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> + 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> + 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> + 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> + 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> + 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> + 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> + 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> + 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> + 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> + 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> + 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> + 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> + 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> + 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> + 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> + 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> + 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> + 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> + 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> + 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> + 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> + 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3> + 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> + 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> + 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> + 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> + 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> + 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> + 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> + 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> + 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> + 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> + 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> + 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> + 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> + 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> + 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> + 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> + 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> + 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> + 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> + 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> + 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> + 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> + 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> + 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> + 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> + 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> + 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> + 
2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> + 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> + 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> + 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> + 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> + 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> + 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> + 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> + 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> + 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> + 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> + 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> + 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> + 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> + 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> + 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> + 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS + 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> + 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> + 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> + 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS + 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> + 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS + 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> + 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS + 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> + 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> + 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> + 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> + 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> + 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> + 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> + 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> + 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> + 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> + 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> + 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> + 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> + 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> + 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> + 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS + 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> + 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> + 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> + 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> + 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS + 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> + 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS + 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS + 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS + 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> + 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> + 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> + 2735951476U, 
// <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> + 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> + 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> + 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> + 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> + 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> + 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> + 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> + 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> + 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> + 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> + 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> + 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> + 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> + 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> + 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> + 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> + 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS + 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> + 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> + 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> + 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS + 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> + 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> + 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> + 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> + 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS + 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> + 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> + 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> + 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS + 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS + 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> + 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> + 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> + 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> + 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> + 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> + 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> + 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> + 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> + 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> + 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> + 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> + 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> + 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> + 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> + 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> + 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS + 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> + 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> + 
2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> + 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS + 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> + 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> + 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS + 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> + 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> + 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS + 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> + 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> + 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> + 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> + 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> + 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> + 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> + 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> + 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> + 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> + 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> + 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS + 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> + 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> + 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> + 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> + 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS + 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> + 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> + 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> + 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> + 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> + 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> + 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> + 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> + 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> + 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> + 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> + 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> + 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> + 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> + 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> + 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> + 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> + 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> + 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> + 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> + 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> + 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> + 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> + 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> + 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> + 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> + 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS + 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS + 
2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> + 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> + 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> + 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> + 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> + 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> + 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> + 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> + 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> + 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> + 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> + 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> + 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> + 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> + 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> + 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> + 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> + 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> + 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> + 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> + 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> + 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> + 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> + 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> + 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> + 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> + 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> + 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> + 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS + 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> + 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> + 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> + 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> + 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> + 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> + 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> + 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> + 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> + 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> + 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> + 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> + 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> + 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> + 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> + 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS + 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> + 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> + 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> + 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> + 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> + 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> + 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> + 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> + 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> + 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> + 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> + 2735953107U, // <7,7,2,7>: Cost 3 
vext3 RHS, <7,2,7,3> + 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> + 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> + 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> + 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> + 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> + 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> + 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> + 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> + 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> + 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> + 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> + 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> + 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> + 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> + 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> + 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> + 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> + 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> + 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> + 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS + 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> + 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> + 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> + 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> + 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> + 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> + 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> + 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> + 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS + 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> + 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> + 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> + 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS + 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> + 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> + 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> + 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> + 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS + 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> + 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> + 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> + 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS + 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> + 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> + 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS + 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS + 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> + 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> + 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> + 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS + 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> + 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7> + 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS + 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2> + 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2> + 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2> + 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1> + 
1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1> + 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS + 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2> + 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS + 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS + 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3> + 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS + 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3> + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0> + 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0> + 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4> + 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3> + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0> + 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> + 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6> + 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> + 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7> + 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0> + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> + 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5> + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> + 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5> + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> + 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6> + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3> + 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7> + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1> + 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7> + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7> + 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7> + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0> + 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7> + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> + 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3> + 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6> + 2573347601U, // <7,u,7,3>: 
Cost 3 vext1 <3,7,u,7>, <3,7,u,7> + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5> + 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> + 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2> + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> + 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS + 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0> + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS + 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS + 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3> + 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0> + 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS + 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS + 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS + 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS + 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3> + 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS + 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7> + 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2> + 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS + 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS + 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7> + 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 72589981U, // <u,0,3,2>: Cost 1 vrev LHS + 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3> + 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6> + 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0> + 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0> + 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3> + 73032403U, // <u,0,3,u>: Cost 1 vrev LHS + 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u> + 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2> + 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS + 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS + 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6> + 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS + 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS + 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 
2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0> + 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5> + 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5> + 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0> + 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS + 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS + 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0> + 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS + 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u> + 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6> + 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1> + 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS + 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS + 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0> + 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7> + 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7> + 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS + 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u> + 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7> + 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7> + 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS + 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS + 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS + 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS + 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1> + 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS + 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1> + 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS + 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0> + 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0> + 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS + 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS + 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2> + 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS + 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1> + 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS + 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS + 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1> + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS + 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7> + 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2> + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // <u,1,3,1>: 
Cost 2 vext3 LHS, <1,3,1,3> + 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS + 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1> + 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u> + 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u> + 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4> + 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS + 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS + 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1> + 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4> + 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4> + 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS + 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2> + 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2> + 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS + 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1> + 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1> + 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS + 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS + 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS + 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1> + 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1> + 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1> + 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS + 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7> + 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1> + 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS + 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS + 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7> + 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u> + 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7> + 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS + 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS + 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS + 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS + 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u> + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2> + 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS + 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2> + 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5> + 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7> + 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u> + 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, 
LHS + 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1> + 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0> + 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS + 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7> + 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1> + 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS + 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2> + 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS + 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS + 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7> + 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS + 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS + 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS + 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS + 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS + 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4> + 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5> + 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS + 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2> + 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS + 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS + 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0> + 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> + 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5> + 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS + 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5> + 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS + 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2> + 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2> + 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS + 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1> + 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2> + 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2> + 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7> + 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS + 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS + 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6> + 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2645726828U, // 
<u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7> + 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS + 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS + 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS + 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS + 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS + 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS + 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS + 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS + 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0> + 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0> + 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0> + 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS + 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u> + 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS + 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> + 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS + 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u> + 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3> + 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3> + 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS + 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3> + 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS + 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2> + 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS + 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4> + 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS + 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS + 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5> + 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6> + 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, 
RHS + 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS + 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6> + 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u> + 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> + 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6> + 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2> + 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS + 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u> + 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u> + 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS + 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS + 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS + 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> + 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS + 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS + 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0> + 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS + 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4> + 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5> + 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS + 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS + 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1> + 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3> + 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS + 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS + 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4> + 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS + 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u> + 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2> + 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5> + 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS + 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS + 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2> + 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4> + 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3> + 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3> + 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6> + 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, 
<2,3,4,5> + 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> + 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS + 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4> + 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4> + 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS + 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS + 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS + 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2> + 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS + 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS + 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS + 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS + 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS + 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2> + 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2> + 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2> + 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS + 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS + 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS + 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS + 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4> + 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7> + 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7> + 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS + 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 96808489U, // <u,4,7,6>: Cost 1 vrev RHS + 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7> + 96955963U, // <u,4,7,u>: Cost 1 vrev RHS + 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS + 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2> + 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS + 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS + 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS + 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS + 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0> + 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS + 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5> + 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5> + 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5> + 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0> + 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS + 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS + 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5> + 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0> + 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7> + 1505971510U, // <u,5,1,4>: Cost 
2 vext1 <4,u,5,1>, RHS + 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5> + 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3> + 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS + 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7> + 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5> + 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5> + 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS + 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7> + 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS + 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5> + 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS + 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3> + 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u> + 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3> + 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6> + 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5> + 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS + 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS + 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS + 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5> + 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4> + 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5> + 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS + 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5> + 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS + 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS + 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7> + 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2> + 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2> + 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS + 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS + 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS + 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS + 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6> + 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6> + 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6> + 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS + 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2> + 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2> + 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS + 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS + 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS + 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u> + 2561853032U, // 
<u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2> + 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u> + 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS + 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS + 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0> + 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6> + 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0> + 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u> + 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0> + 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS + 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS + 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> + 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1> + 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0> + 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3> + 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6> + 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7> + 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS + 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1> + 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS + 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2> + 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2> + 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1> + 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS + 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6> + 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7> + 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2> + 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3> + 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3> + 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3> + 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6> + 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS + 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS + 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS + 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3> + 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4> + 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u> + 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS + 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> + 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS + 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS + 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS + 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3> + 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7> + 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5> + 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0> + 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS + 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5> + 
1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS + 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2> + 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3> + 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2> + 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS + 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3> + 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS + 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS + 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS + 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS + 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS + 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS + 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS + 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS + 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS + 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS + 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS + 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS + 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS + 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0> + 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7> + 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2> + 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS + 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7> + 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS + 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7> + 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u> + 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7> + 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7> + 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> + 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7> + 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7> + 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u> + 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7> + 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u> + 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u> + 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3> + 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 2640456465U, // <u,7,3,7>: Cost 3 vext2 
<3,7,u,7>, <3,7,u,7> + 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7> + 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4> + 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS + 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0> + 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS + 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS + 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS + 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> + 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7> + 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2> + 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7> + 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7> + 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7> + 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS + 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS + 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS + 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS + 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS + 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS + 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS + 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0> + 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS + 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS + 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS + 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS + 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 537753444U, // 
<u,u,1,u>: Cost 1 vext3 LHS, LHS + 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS + 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2> + 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS + 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 835584U, // <u,u,2,u>: Cost 0 copy LHS + 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS + 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 120371557U, // <u,u,3,2>: Cost 1 vrev LHS + 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS + 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS + 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS + 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS + 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4> + 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS + 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS + 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS + 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS + 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS + 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS + 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS + 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS + 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS + 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS + 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2> + 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS + 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6> + 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS + 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS + 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS + 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 120699277U, // <u,u,7,6>: Cost 1 vrev RHS + 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS + 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS + 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS + 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS + 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS + 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS + 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS + 27705344U, // <u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS + 0 +}; diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp new file mode 100644 index 0000000..d5bc3f6 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp @@ -0,0 +1,41 @@ 
+//===- ARMRegisterInfo.cpp - ARM Register Information -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMBaseInstrInfo.h" +#include "ARMInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +using namespace llvm; + +ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &sti) + : ARMBaseRegisterInfo(tii, sti) { +} diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h new file mode 100644 index 0000000..8edfb9a --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h @@ -0,0 +1,33 @@ +//===- ARMRegisterInfo.h - ARM Register Information Impl --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMREGISTERINFO_H +#define ARMREGISTERINFO_H + +#include "ARM.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "ARMBaseRegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + class ARMBaseInstrInfo; + class Type; + +struct ARMRegisterInfo : public ARMBaseRegisterInfo { +public: + ARMRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td new file mode 100644 index 0000000..6beca8b --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -0,0 +1,482 @@ +//===- ARMRegisterInfo.td - ARM Register defs -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the ARM register file +//===----------------------------------------------------------------------===// + +// Registers are identified with 4-bit ID numbers. 
+class ARMReg<bits<4> num, string n, list<Register> subregs = []> : Register<n> { + field bits<4> Num; + let Namespace = "ARM"; + let SubRegs = subregs; +} + +class ARMFReg<bits<6> num, string n> : Register<n> { + field bits<6> Num; + let Namespace = "ARM"; +} + +// Subregister indices. +let Namespace = "ARM" in { +// Note: Code depends on these having consecutive numbers. +def ssub_0 : SubRegIndex; +def ssub_1 : SubRegIndex; +def ssub_2 : SubRegIndex; // In a Q reg. +def ssub_3 : SubRegIndex; +def ssub_4 : SubRegIndex; // In a QQ reg. +def ssub_5 : SubRegIndex; +def ssub_6 : SubRegIndex; +def ssub_7 : SubRegIndex; +def ssub_8 : SubRegIndex; // In a QQQQ reg. +def ssub_9 : SubRegIndex; +def ssub_10 : SubRegIndex; +def ssub_11 : SubRegIndex; +def ssub_12 : SubRegIndex; +def ssub_13 : SubRegIndex; +def ssub_14 : SubRegIndex; +def ssub_15 : SubRegIndex; + +def dsub_0 : SubRegIndex; +def dsub_1 : SubRegIndex; +def dsub_2 : SubRegIndex; +def dsub_3 : SubRegIndex; +def dsub_4 : SubRegIndex; +def dsub_5 : SubRegIndex; +def dsub_6 : SubRegIndex; +def dsub_7 : SubRegIndex; + +def qsub_0 : SubRegIndex; +def qsub_1 : SubRegIndex; +def qsub_2 : SubRegIndex; +def qsub_3 : SubRegIndex; + +def qqsub_0 : SubRegIndex; +def qqsub_1 : SubRegIndex; +} + +// Integer registers +def R0 : ARMReg< 0, "r0">, DwarfRegNum<[0]>; +def R1 : ARMReg< 1, "r1">, DwarfRegNum<[1]>; +def R2 : ARMReg< 2, "r2">, DwarfRegNum<[2]>; +def R3 : ARMReg< 3, "r3">, DwarfRegNum<[3]>; +def R4 : ARMReg< 4, "r4">, DwarfRegNum<[4]>; +def R5 : ARMReg< 5, "r5">, DwarfRegNum<[5]>; +def R6 : ARMReg< 6, "r6">, DwarfRegNum<[6]>; +def R7 : ARMReg< 7, "r7">, DwarfRegNum<[7]>; +def R8 : ARMReg< 8, "r8">, DwarfRegNum<[8]>; +def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>; +def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>; +def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>; +def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>; +def SP : ARMReg<13, "sp">, DwarfRegNum<[13]>; +def LR : ARMReg<14, "lr">, DwarfRegNum<[14]>; +def PC : ARMReg<15, "pc">, DwarfRegNum<[15]>; + +// Float registers +def S0 : ARMFReg< 0, "s0">; def S1 : ARMFReg< 1, "s1">; +def S2 : ARMFReg< 2, "s2">; def S3 : ARMFReg< 3, "s3">; +def S4 : ARMFReg< 4, "s4">; def S5 : ARMFReg< 5, "s5">; +def S6 : ARMFReg< 6, "s6">; def S7 : ARMFReg< 7, "s7">; +def S8 : ARMFReg< 8, "s8">; def S9 : ARMFReg< 9, "s9">; +def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">; +def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">; +def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">; +def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">; +def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">; +def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">; +def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">; +def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">; +def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">; +def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">; +def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">; + +// Aliases of the F* registers used to hold 64-bit fp values (doubles) +let SubRegIndices = [ssub_0, ssub_1] in { +def D0 : ARMReg< 0, "d0", [S0, S1]>; +def D1 : ARMReg< 1, "d1", [S2, S3]>; +def D2 : ARMReg< 2, "d2", [S4, S5]>; +def D3 : ARMReg< 3, "d3", [S6, S7]>; +def D4 : ARMReg< 4, "d4", [S8, S9]>; +def D5 : ARMReg< 5, "d5", [S10, S11]>; +def D6 : ARMReg< 6, "d6", [S12, S13]>; +def D7 : ARMReg< 7, "d7", [S14, S15]>; +def D8 : ARMReg< 8, "d8", [S16, S17]>; +def D9 : ARMReg< 9, "d9", [S18, S19]>; +def D10 : ARMReg<10, "d10", [S20, S21]>; 
+def D11 : ARMReg<11, "d11", [S22, S23]>; +def D12 : ARMReg<12, "d12", [S24, S25]>; +def D13 : ARMReg<13, "d13", [S26, S27]>; +def D14 : ARMReg<14, "d14", [S28, S29]>; +def D15 : ARMReg<15, "d15", [S30, S31]>; +} + +// VFP3 defines 16 additional double registers +def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">; +def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">; +def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">; +def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">; +def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">; +def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">; +def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">; +def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">; + +// Advanced SIMD (NEON) defines 16 quad-word aliases +let SubRegIndices = [dsub_0, dsub_1], + CompositeIndices = [(ssub_2 dsub_1, ssub_0), + (ssub_3 dsub_1, ssub_1)] in { +def Q0 : ARMReg< 0, "q0", [D0, D1]>; +def Q1 : ARMReg< 1, "q1", [D2, D3]>; +def Q2 : ARMReg< 2, "q2", [D4, D5]>; +def Q3 : ARMReg< 3, "q3", [D6, D7]>; +def Q4 : ARMReg< 4, "q4", [D8, D9]>; +def Q5 : ARMReg< 5, "q5", [D10, D11]>; +def Q6 : ARMReg< 6, "q6", [D12, D13]>; +def Q7 : ARMReg< 7, "q7", [D14, D15]>; +} +let SubRegIndices = [dsub_0, dsub_1] in { +def Q8 : ARMReg< 8, "q8", [D16, D17]>; +def Q9 : ARMReg< 9, "q9", [D18, D19]>; +def Q10 : ARMReg<10, "q10", [D20, D21]>; +def Q11 : ARMReg<11, "q11", [D22, D23]>; +def Q12 : ARMReg<12, "q12", [D24, D25]>; +def Q13 : ARMReg<13, "q13", [D26, D27]>; +def Q14 : ARMReg<14, "q14", [D28, D29]>; +def Q15 : ARMReg<15, "q15", [D30, D31]>; +} + +// Pseudo 256-bit registers to represent pairs of Q registers. These should +// never be present in the emitted code. +// These are used for NEON load / store instructions, e.g. vld4, vst3. +// NOTE: It's possible to define more QQ registers since technically the +// starting D register number doesn't have to be a multiple of 4, e.g. +// D1, D2, D3, D4 would be a legal quad. But that would make the sub-register +// handling very messy. +let SubRegIndices = [qsub_0, qsub_1] in { +let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1), + (ssub_4 qsub_1, ssub_0), (ssub_5 qsub_1, ssub_1), + (ssub_6 qsub_1, ssub_2), (ssub_7 qsub_1, ssub_3)] in { +def QQ0 : ARMReg<0, "qq0", [Q0, Q1]>; +def QQ1 : ARMReg<1, "qq1", [Q2, Q3]>; +def QQ2 : ARMReg<2, "qq2", [Q4, Q5]>; +def QQ3 : ARMReg<3, "qq3", [Q6, Q7]>; +} +let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1)] in { +def QQ4 : ARMReg<4, "qq4", [Q8, Q9]>; +def QQ5 : ARMReg<5, "qq5", [Q10, Q11]>; +def QQ6 : ARMReg<6, "qq6", [Q12, Q13]>; +def QQ7 : ARMReg<7, "qq7", [Q14, Q15]>; +} +} + +// Pseudo 512-bit registers to represent four consecutive Q registers.
+let SubRegIndices = [qqsub_0, qqsub_1] in { +let CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1), + (dsub_4 qqsub_1, dsub_0), (dsub_5 qqsub_1, dsub_1), + (dsub_6 qqsub_1, dsub_2), (dsub_7 qqsub_1, dsub_3), + (ssub_8 qqsub_1, ssub_0), (ssub_9 qqsub_1, ssub_1), + (ssub_10 qqsub_1, ssub_2), (ssub_11 qqsub_1, ssub_3), + (ssub_12 qqsub_1, ssub_4), (ssub_13 qqsub_1, ssub_5), + (ssub_14 qqsub_1, ssub_6), (ssub_15 qqsub_1, ssub_7)] in { +def QQQQ0 : ARMReg<0, "qqqq0", [QQ0, QQ1]>; +def QQQQ1 : ARMReg<1, "qqqq1", [QQ2, QQ3]>; +} +let CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1), + (dsub_4 qqsub_1, dsub_0), (dsub_5 qqsub_1, dsub_1), + (dsub_6 qqsub_1, dsub_2), (dsub_7 qqsub_1, dsub_3)] in { +def QQQQ2 : ARMReg<2, "qqqq2", [QQ4, QQ5]>; +def QQQQ3 : ARMReg<3, "qqqq3", [QQ6, QQ7]>; +} +} + +// Current Program Status Register. +def CPSR : ARMReg<0, "cpsr">; + +def FPSCR : ARMReg<1, "fpscr">; + +// Register classes. +// +// pc == Program Counter +// lr == Link Register +// sp == Stack Pointer +// r12 == ip (scratch) +// r7 == Frame Pointer (thumb-style backtraces) +// r9 == May be reserved as Thread Register +// r11 == Frame Pointer (arm-style backtraces) +// r10 == Stack Limit +// +def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, + R7, R8, R9, R10, R11, R12, + SP, LR, PC]> { + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + // FP is R11, R9 is available. + static const unsigned ARM_GPR_AO_1[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, ARM::R7, + ARM::R8, ARM::R9, ARM::R10, + ARM::R11 }; + // FP is R11, R9 is not available. + static const unsigned ARM_GPR_AO_2[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, ARM::R7, + ARM::R8, ARM::R10, + ARM::R11 }; + // FP is R7, R9 is available as non-callee-saved register. + // This is used by Darwin. + static const unsigned ARM_GPR_AO_3[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R9, ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, + ARM::R8, ARM::R10,ARM::R11,ARM::R7 }; + // FP is R7, R9 is not available. + static const unsigned ARM_GPR_AO_4[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, + ARM::R8, ARM::R10,ARM::R11, + ARM::R7 }; + // FP is R7, R9 is available as callee-saved register. + // This is used by non-Darwin platform in Thumb mode. + static const unsigned ARM_GPR_AO_5[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, + ARM::R8, ARM::R9, ARM::R10,ARM::R11,ARM::R7 }; + + // For Thumb1 mode, we don't want to allocate hi regs at all, as we + // don't know how to spill them. If we make our prologue/epilogue code + // smarter at some point, we can go back to using the above allocation + // orders for the Thumb1 instructions that know how to use hi regs. 
+    static const unsigned THUMB_GPR_AO[] = { +      ARM::R0, ARM::R1, ARM::R2, ARM::R3, +      ARM::R4, ARM::R5, ARM::R6, ARM::R7 }; + +    GPRClass::iterator +    GPRClass::allocation_order_begin(const MachineFunction &MF) const { +      const TargetMachine &TM = MF.getTarget(); +      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); +      if (Subtarget.isThumb1Only()) +        return THUMB_GPR_AO; +      if (Subtarget.isTargetDarwin()) { +        if (Subtarget.isR9Reserved()) +          return ARM_GPR_AO_4; +        else +          return ARM_GPR_AO_3; +      } else { +        if (Subtarget.isR9Reserved()) +          return ARM_GPR_AO_2; +        else if (Subtarget.isThumb()) +          return ARM_GPR_AO_5; +        else +          return ARM_GPR_AO_1; +      } +    } + +    GPRClass::iterator +    GPRClass::allocation_order_end(const MachineFunction &MF) const { +      const TargetMachine &TM = MF.getTarget(); +      const TargetRegisterInfo *RI = TM.getRegisterInfo(); +      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); +      GPRClass::iterator I; + +      if (Subtarget.isThumb1Only()) { +        I = THUMB_GPR_AO + (sizeof(THUMB_GPR_AO)/sizeof(unsigned)); +        // Mac OS X requires FP not to be clobbered for backtracing purposes. +        return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I; +      } + +      if (Subtarget.isTargetDarwin()) { +        if (Subtarget.isR9Reserved()) +          I = ARM_GPR_AO_4 + (sizeof(ARM_GPR_AO_4)/sizeof(unsigned)); +        else +          I = ARM_GPR_AO_3 + (sizeof(ARM_GPR_AO_3)/sizeof(unsigned)); +      } else { +        if (Subtarget.isR9Reserved()) +          I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned)); +        else if (Subtarget.isThumb()) +          I = ARM_GPR_AO_5 + (sizeof(ARM_GPR_AO_5)/sizeof(unsigned)); +        else +          I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned)); +      } + +      // Mac OS X requires FP not to be clobbered for backtracing purposes. +      return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I; +    } +  }]; +} + +// Thumb registers are R0-R7 normally. Some instructions can still use +// the general GPR register class above (MOV, e.g.) +def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> { +  let MethodProtos = [{ +    iterator allocation_order_begin(const MachineFunction &MF) const; +    iterator allocation_order_end(const MachineFunction &MF) const; +  }]; +  let MethodBodies = [{ +    static const unsigned THUMB_tGPR_AO[] = { +      ARM::R0, ARM::R1, ARM::R2, ARM::R3, +      ARM::R4, ARM::R5, ARM::R6, ARM::R7 }; + +    // FP is R7, only low registers available. +    tGPRClass::iterator +    tGPRClass::allocation_order_begin(const MachineFunction &MF) const { +      return THUMB_tGPR_AO; +    } + +    tGPRClass::iterator +    tGPRClass::allocation_order_end(const MachineFunction &MF) const { +      const TargetMachine &TM = MF.getTarget(); +      const TargetRegisterInfo *RI = TM.getRegisterInfo(); +      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); +      tGPRClass::iterator I = +        THUMB_tGPR_AO + (sizeof(THUMB_tGPR_AO)/sizeof(unsigned)); +      // Mac OS X requires FP not to be clobbered for backtracing purposes. +      return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I; +    } +  }]; +} + +// Scalar single precision floating point register class. +def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8, +  S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, +  S23, S24, S25, S26, S27, S28, S29, S30, S31]>; + +// Subset of SPR which can be used as a source of NEON scalars for 16-bit +// operations +def SPR_8 : RegisterClass<"ARM", [f32], 32, +                          [S0, S1, S2, S3, S4, S5, S6, S7, +                           S8, S9, S10, S11, S12, S13, S14, S15]>; + +// Scalar double precision floating point / generic 64-bit vector register +// class.
+// ARM requires only word alignment for doubles, though they perform better +// when double-word aligned. +def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +                        [D0, D1, D2, D3, D4, D5, D6, D7, +                         D8, D9, D10, D11, D12, D13, D14, D15, +                         D16, D17, D18, D19, D20, D21, D22, D23, +                         D24, D25, D26, D27, D28, D29, D30, D31]> { +  let MethodProtos = [{ +    iterator allocation_order_begin(const MachineFunction &MF) const; +    iterator allocation_order_end(const MachineFunction &MF) const; +  }]; +  let MethodBodies = [{ +    // VFP2 +    static const unsigned ARM_DPR_VFP2[] = { +      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3, +      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7, +      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11, +      ARM::D12, ARM::D13, ARM::D14, ARM::D15 }; +    // VFP3 +    static const unsigned ARM_DPR_VFP3[] = { +      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3, +      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7, +      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11, +      ARM::D12, ARM::D13, ARM::D14, ARM::D15, +      ARM::D16, ARM::D17, ARM::D18, ARM::D19, +      ARM::D20, ARM::D21, ARM::D22, ARM::D23, +      ARM::D24, ARM::D25, ARM::D26, ARM::D27, +      ARM::D28, ARM::D29, ARM::D30, ARM::D31 }; +    DPRClass::iterator +    DPRClass::allocation_order_begin(const MachineFunction &MF) const { +      const TargetMachine &TM = MF.getTarget(); +      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); +      if (Subtarget.hasVFP3()) +        return ARM_DPR_VFP3; +      return ARM_DPR_VFP2; +    } + +    DPRClass::iterator +    DPRClass::allocation_order_end(const MachineFunction &MF) const { +      const TargetMachine &TM = MF.getTarget(); +      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); +      if (Subtarget.hasVFP3()) +        return ARM_DPR_VFP3 + (sizeof(ARM_DPR_VFP3)/sizeof(unsigned)); +      else +        return ARM_DPR_VFP2 + (sizeof(ARM_DPR_VFP2)/sizeof(unsigned)); +    } +  }]; +} + +// Subset of DPR that are accessible with VFP2 (and that therefore also have +// 32-bit SPR subregs). +def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +                             [D0, D1, D2, D3, D4, D5, D6, D7, +                              D8, D9, D10, D11, D12, D13, D14, D15]> { +  let SubRegClasses = [(SPR ssub_0, ssub_1)]; +} + +// Subset of DPR which can be used as a source of NEON scalars for 16-bit +// operations +def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +                          [D0, D1, D2, D3, D4, D5, D6, D7]> { +  let SubRegClasses = [(SPR_8 ssub_0, ssub_1)]; +} + +// Generic 128-bit vector register class. +def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, +                        [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, +                         Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> { +  let SubRegClasses = [(DPR dsub_0, dsub_1)]; +} + +// Subset of QPR that have 32-bit SPR subregs. +def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], +                             128, +                             [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]> { +  let SubRegClasses = [(SPR ssub_0, ssub_1, ssub_2, ssub_3), +                       (DPR_VFP2 dsub_0, dsub_1)]; +} + +// Subset of QPR that have DPR_8 and SPR_8 subregs. +def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], +                          128, +                          [Q0, Q1, Q2, Q3]> { +  let SubRegClasses = [(SPR_8 ssub_0, ssub_1, ssub_2, ssub_3), +                       (DPR_8 dsub_0, dsub_1)]; +} + +// Pseudo 256-bit vector register class to model pairs of Q registers +// (4 consecutive D registers). +def QQPR : RegisterClass<"ARM", [v4i64], +                         256, +                         [QQ0, QQ1, QQ2, QQ3, QQ4, QQ5, QQ6, QQ7]> { +  let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3), +                       (QPR qsub_0, qsub_1)]; +} + +// Subset of QQPR that have 32-bit SPR subregs.
+def QQPR_VFP2 : RegisterClass<"ARM", [v4i64], + 256, + [QQ0, QQ1, QQ2, QQ3]> { + let SubRegClasses = [(SPR ssub_0, ssub_1, ssub_2, ssub_3), + (DPR_VFP2 dsub_0, dsub_1, dsub_2, dsub_3), + (QPR_VFP2 qsub_0, qsub_1)]; + +} + +// Pseudo 512-bit vector register class to model 4 consecutive Q registers +// (8 consecutive D registers). +def QQQQPR : RegisterClass<"ARM", [v8i64], + 256, + [QQQQ0, QQQQ1, QQQQ2, QQQQ3]> { + let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3, + dsub_4, dsub_5, dsub_6, dsub_7), + (QPR qsub_0, qsub_1, qsub_2, qsub_3)]; +} + +// Condition code registers. +def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>; + diff --git a/contrib/llvm/lib/Target/ARM/ARMRelocations.h b/contrib/llvm/lib/Target/ARM/ARMRelocations.h new file mode 100644 index 0000000..86e7206 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRelocations.h @@ -0,0 +1,62 @@ +//===- ARMRelocations.h - ARM Code Relocations ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the ARM target-specific relocation types. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMRELOCATIONS_H +#define ARMRELOCATIONS_H + +#include "llvm/CodeGen/MachineRelocation.h" + +namespace llvm { + namespace ARM { + enum RelocationType { + // reloc_arm_absolute - Absolute relocation, just add the relocated value + // to the value already in memory. + reloc_arm_absolute, + + // reloc_arm_relative - PC relative relocation, add the relocated value to + // the value already in memory, after we adjust it for where the PC is. + reloc_arm_relative, + + // reloc_arm_cp_entry - PC relative relocation for constpool_entry's whose + // addresses are kept locally in a map. + reloc_arm_cp_entry, + + // reloc_arm_vfp_cp_entry - Same as reloc_arm_cp_entry except the offset + // should be divided by 4. + reloc_arm_vfp_cp_entry, + + // reloc_arm_machine_cp_entry - Relocation of a ARM machine constantpool + // entry. + reloc_arm_machine_cp_entry, + + // reloc_arm_jt_base - PC relative relocation for jump tables whose + // addresses are kept locally in a map. + reloc_arm_jt_base, + + // reloc_arm_pic_jt - PIC jump table entry relocation: dest bb - jt base. + reloc_arm_pic_jt, + + // reloc_arm_branch - Branch address relocation. + reloc_arm_branch, + + // reloc_arm_movt - MOVT immediate relocation. + reloc_arm_movt, + + // reloc_arm_movw - MOVW immediate relocation. + reloc_arm_movw + }; + } +} + +#endif + diff --git a/contrib/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm/lib/Target/ARM/ARMSchedule.td new file mode 100644 index 0000000..b60ccca --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSchedule.td @@ -0,0 +1,159 @@ +//===- ARMSchedule.td - ARM Scheduling Definitions ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Instruction Itinerary classes used for ARM +// +def IIC_iALUx : InstrItinClass; +def IIC_iALUi : InstrItinClass; +def IIC_iALUr : InstrItinClass; +def IIC_iALUsi : InstrItinClass; +def IIC_iALUsr : InstrItinClass; +def IIC_iUNAr : InstrItinClass; +def IIC_iUNAsi : InstrItinClass; +def IIC_iUNAsr : InstrItinClass; +def IIC_iCMPi : InstrItinClass; +def IIC_iCMPr : InstrItinClass; +def IIC_iCMPsi : InstrItinClass; +def IIC_iCMPsr : InstrItinClass; +def IIC_iMOVi : InstrItinClass; +def IIC_iMOVr : InstrItinClass; +def IIC_iMOVsi : InstrItinClass; +def IIC_iMOVsr : InstrItinClass; +def IIC_iCMOVi : InstrItinClass; +def IIC_iCMOVr : InstrItinClass; +def IIC_iCMOVsi : InstrItinClass; +def IIC_iCMOVsr : InstrItinClass; +def IIC_iMUL16 : InstrItinClass; +def IIC_iMAC16 : InstrItinClass; +def IIC_iMUL32 : InstrItinClass; +def IIC_iMAC32 : InstrItinClass; +def IIC_iMUL64 : InstrItinClass; +def IIC_iMAC64 : InstrItinClass; +def IIC_iLoadi : InstrItinClass; +def IIC_iLoadr : InstrItinClass; +def IIC_iLoadsi : InstrItinClass; +def IIC_iLoadiu : InstrItinClass; +def IIC_iLoadru : InstrItinClass; +def IIC_iLoadsiu : InstrItinClass; +def IIC_iLoadm : InstrItinClass; +def IIC_iStorei : InstrItinClass; +def IIC_iStorer : InstrItinClass; +def IIC_iStoresi : InstrItinClass; +def IIC_iStoreiu : InstrItinClass; +def IIC_iStoreru : InstrItinClass; +def IIC_iStoresiu : InstrItinClass; +def IIC_iStorem : InstrItinClass; +def IIC_Br : InstrItinClass; +def IIC_fpSTAT : InstrItinClass; +def IIC_fpUNA32 : InstrItinClass; +def IIC_fpUNA64 : InstrItinClass; +def IIC_fpCMP32 : InstrItinClass; +def IIC_fpCMP64 : InstrItinClass; +def IIC_fpCVTSD : InstrItinClass; +def IIC_fpCVTDS : InstrItinClass; +def IIC_fpCVTSH : InstrItinClass; +def IIC_fpCVTHS : InstrItinClass; +def IIC_fpCVTIS : InstrItinClass; +def IIC_fpCVTID : InstrItinClass; +def IIC_fpCVTSI : InstrItinClass; +def IIC_fpCVTDI : InstrItinClass; +def IIC_fpMOVIS : InstrItinClass; +def IIC_fpMOVID : InstrItinClass; +def IIC_fpMOVSI : InstrItinClass; +def IIC_fpMOVDI : InstrItinClass; +def IIC_fpALU32 : InstrItinClass; +def IIC_fpALU64 : InstrItinClass; +def IIC_fpMUL32 : InstrItinClass; +def IIC_fpMUL64 : InstrItinClass; +def IIC_fpMAC32 : InstrItinClass; +def IIC_fpMAC64 : InstrItinClass; +def IIC_fpDIV32 : InstrItinClass; +def IIC_fpDIV64 : InstrItinClass; +def IIC_fpSQRT32 : InstrItinClass; +def IIC_fpSQRT64 : InstrItinClass; +def IIC_fpLoad32 : InstrItinClass; +def IIC_fpLoad64 : InstrItinClass; +def IIC_fpLoadm : InstrItinClass; +def IIC_fpStore32 : InstrItinClass; +def IIC_fpStore64 : InstrItinClass; +def IIC_fpStorem : InstrItinClass; +def IIC_VLD1 : InstrItinClass; +def IIC_VLD2 : InstrItinClass; +def IIC_VLD3 : InstrItinClass; +def IIC_VLD4 : InstrItinClass; +def IIC_VST : InstrItinClass; +def IIC_VUNAD : InstrItinClass; +def IIC_VUNAQ : InstrItinClass; +def IIC_VBIND : InstrItinClass; +def IIC_VBINQ : InstrItinClass; +def IIC_VMOVImm : InstrItinClass; +def IIC_VMOVD : InstrItinClass; +def IIC_VMOVQ : InstrItinClass; +def IIC_VMOVIS : InstrItinClass; +def IIC_VMOVID : InstrItinClass; +def IIC_VMOVISL : InstrItinClass; +def IIC_VMOVSI : InstrItinClass; +def IIC_VMOVDI : InstrItinClass; +def IIC_VPERMD : InstrItinClass; +def IIC_VPERMQ : InstrItinClass; +def IIC_VPERMQ3 : InstrItinClass; +def IIC_VMACD : InstrItinClass; +def IIC_VMACQ : InstrItinClass; +def IIC_VRECSD : InstrItinClass; 
+def IIC_VRECSQ : InstrItinClass; +def IIC_VCNTiD : InstrItinClass; +def IIC_VCNTiQ : InstrItinClass; +def IIC_VUNAiD : InstrItinClass; +def IIC_VUNAiQ : InstrItinClass; +def IIC_VQUNAiD : InstrItinClass; +def IIC_VQUNAiQ : InstrItinClass; +def IIC_VBINiD : InstrItinClass; +def IIC_VBINiQ : InstrItinClass; +def IIC_VSUBiD : InstrItinClass; +def IIC_VSUBiQ : InstrItinClass; +def IIC_VBINi4D : InstrItinClass; +def IIC_VBINi4Q : InstrItinClass; +def IIC_VSUBi4D : InstrItinClass; +def IIC_VSUBi4Q : InstrItinClass; +def IIC_VABAD : InstrItinClass; +def IIC_VABAQ : InstrItinClass; +def IIC_VSHLiD : InstrItinClass; +def IIC_VSHLiQ : InstrItinClass; +def IIC_VSHLi4D : InstrItinClass; +def IIC_VSHLi4Q : InstrItinClass; +def IIC_VPALiD : InstrItinClass; +def IIC_VPALiQ : InstrItinClass; +def IIC_VMULi16D : InstrItinClass; +def IIC_VMULi32D : InstrItinClass; +def IIC_VMULi16Q : InstrItinClass; +def IIC_VMULi32Q : InstrItinClass; +def IIC_VMACi16D : InstrItinClass; +def IIC_VMACi32D : InstrItinClass; +def IIC_VMACi16Q : InstrItinClass; +def IIC_VMACi32Q : InstrItinClass; +def IIC_VEXTD : InstrItinClass; +def IIC_VEXTQ : InstrItinClass; +def IIC_VTB1 : InstrItinClass; +def IIC_VTB2 : InstrItinClass; +def IIC_VTB3 : InstrItinClass; +def IIC_VTB4 : InstrItinClass; +def IIC_VTBX1 : InstrItinClass; +def IIC_VTBX2 : InstrItinClass; +def IIC_VTBX3 : InstrItinClass; +def IIC_VTBX4 : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Processor instruction itineraries. + +def GenericItineraries : ProcessorItineraries<[], []>; + +include "ARMScheduleV6.td" +include "ARMScheduleA8.td" +include "ARMScheduleA9.td" diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td new file mode 100644 index 0000000..bbfc0b2 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td @@ -0,0 +1,618 @@ +//=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM Cortex A8 processors. +// +//===----------------------------------------------------------------------===// + +// +// Scheduling information derived from "Cortex-A8 Technical Reference Manual". +// Functional Units. 
+def A8_Issue : FuncUnit; // issue +def A8_Pipe0 : FuncUnit; // pipeline 0 +def A8_Pipe1 : FuncUnit; // pipeline 1 +def A8_LdSt0 : FuncUnit; // pipeline 0 load/store +def A8_LdSt1 : FuncUnit; // pipeline 1 load/store +def A8_NPipe : FuncUnit; // NEON ALU/MUL pipe +def A8_NLSPipe : FuncUnit; // NEON LS pipe +// +// Dual issue pipeline represented by A8_Pipe0 | A8_Pipe1 +// +def CortexA8Itineraries : ProcessorItineraries< + [A8_Issue, A8_Pipe0, A8_Pipe1, A8_LdSt0, A8_LdSt1, A8_NPipe, A8_NLSPipe], [ + // Two fully-pipelined integer ALU pipelines + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iALUr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, + // + // Unary Instructions that produce a result + InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iUNAsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iUNAsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iCMPsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMPsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, + // + // Move instructions, conditional + InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + + // Integer multiply pipeline + // Result written in E5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, [A8_Pipe1], 0>, + InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, + InstrItinData<IIC_iMUL32 , [InstrStage<1, [A8_Pipe1], 0>, + InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<1, [A8_Pipe1], 0>, + InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, + InstrItinData<IIC_iMUL64 , [InstrStage<2, [A8_Pipe1], 0>, + InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<2, [A8_Pipe1], 0>, + InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, + + // Integer load pipeline + // + // loads have an extra cycle of latency, but are fully pipelined + // use A8_Issue to enforce the 1 load/store per cycle limit + // + // Immediate offset + InstrItinData<IIC_iLoadi , [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iLoadr , [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, 
[A8_LdSt0]>], [3, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iLoadsi , [InstrStage<2, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0], 0>, + InstrStage<1, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [4, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoadiu , [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [3, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iLoadru , [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [3, 2, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iLoadsiu , [InstrStage<2, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0], 0>, + InstrStage<1, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [4, 3, 1, 1]>, + // + // Load multiple + InstrItinData<IIC_iLoadm , [InstrStage<2, [A8_Issue], 0>, + InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>]>, + + // Integer store pipeline + // + // use A8_Issue to enforce the 1 load/store per cycle limit + // + // Immediate offset + InstrItinData<IIC_iStorei , [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iStorer , [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [3, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iStoresi , [InstrStage<2, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0], 0>, + InstrStage<1, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [3, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iStoreiu , [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [2, 3, 1]>, + // + // Register offset with update + InstrItinData<IIC_iStoreru , [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [2, 3, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iStoresiu, [InstrStage<2, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0], 0>, + InstrStage<1, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [3, 3, 1, 1]>, + // + // Store multiple + InstrItinData<IIC_iStorem , [InstrStage<2, [A8_Issue], 0>, + InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>]>, + + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData<IIC_Br , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>, + + // VFP + // Issue through integer pipeline, and execute in NEON unit. We assume + // RunFast mode so that NFP pipeline is used for single-precision when + // possible. 
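+  //
+  // Illustrative reading of the entries that follow (an inference from the
+  // data below, not a statement from the TRM): an entry such as
+  //   InstrItinData<IIC_fpALU32, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  //                               InstrStage<1, [A8_NPipe]>], [7, 1, 1]>
+  // issues for one cycle on either integer pipe and then occupies the NEON
+  // ALU/MUL pipe for one cycle; the trailing operand-cycle list says the
+  // result is available 7 cycles after issue and both sources are read in
+  // cycle 1.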
+ // + // FP Special Register to Integer Register File Move + InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>]>, + // + // Single-precision FP Unary + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [7, 1]>, + // + // Double-precision FP Unary + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<4, [A8_NPipe], 0>, + InstrStage<4, [A8_NLSPipe]>], [4, 1]>, + // + // Single-precision FP Compare + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [1, 1]>, + // + // Double-precision FP Compare + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<4, [A8_NPipe], 0>, + InstrStage<4, [A8_NLSPipe]>], [4, 1]>, + // + // Single to Double FP Convert + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<7, [A8_NPipe], 0>, + InstrStage<7, [A8_NLSPipe]>], [7, 1]>, + // + // Double to Single FP Convert + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<5, [A8_NPipe], 0>, + InstrStage<5, [A8_NLSPipe]>], [5, 1]>, + // + // Single-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [7, 1]>, + // + // Double-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<8, [A8_NPipe], 0>, + InstrStage<8, [A8_NLSPipe]>], [8, 1]>, + // + // Integer to Single-Precision FP Convert + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [7, 1]>, + // + // Integer to Double-Precision FP Convert + InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<8, [A8_NPipe], 0>, + InstrStage<8, [A8_NLSPipe]>], [8, 1]>, + // + // Single-precision FP ALU + InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [7, 1, 1]>, + // + // Double-precision FP ALU + InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<9, [A8_NPipe], 0>, + InstrStage<9, [A8_NLSPipe]>], [9, 1, 1]>, + // + // Single-precision FP Multiply + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [7, 1, 1]>, + // + // Double-precision FP Multiply + InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<11, [A8_NPipe], 0>, + InstrStage<11, [A8_NLSPipe]>], [11, 1, 1]>, + // + // Single-precision FP MAC + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>, + // + // Double-precision FP MAC + InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<19, [A8_NPipe], 0>, + InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, + // + // Single-precision FP DIV + InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<20, [A8_NPipe], 0>, + InstrStage<20, [A8_NLSPipe]>], [20, 1, 1]>, + // + // Double-precision FP DIV + InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<29, [A8_NPipe], 0>, + InstrStage<29, [A8_NLSPipe]>], [29, 1, 1]>, + // + // Single-precision FP SQRT + InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<19, [A8_NPipe], 0>, + InstrStage<19, [A8_NLSPipe]>], [19, 1]>, + // + // Double-precision FP SQRT + InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<29, [A8_NPipe], 0>, + InstrStage<29, [A8_NLSPipe]>], 
[29, 1]>, + // + // Single-precision FP Load + // use A8_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>]>, + // + // Double-precision FP Load + // use A8_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpLoad64, [InstrStage<2, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0], 0>, + InstrStage<1, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>]>, + // + // FP Load Multiple + // use A8_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpLoadm, [InstrStage<3, [A8_Issue], 0>, + InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>]>, + // + // Single-precision FP Store + // use A8_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>]>, + // + // Double-precision FP Store + // use A8_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStore64,[InstrStage<2, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0], 0>, + InstrStage<1, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>]>, + // + // FP Store Multiple + // use A8_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStorem, [InstrStage<3, [A8_Issue], 0>, + InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>]>, + + // NEON + // Issue through integer pipeline, and execute in NEON unit. 
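+  //
+  // One illustrative reading of the NEON entries (inferred from the data
+  // below, not quoted from the TRM): quad-register FP binary operations hold
+  // the NEON ALU/MUL pipe for two cycles, e.g.
+  //   InstrItinData<IIC_VBINQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  //                             InstrStage<2, [A8_NPipe]>], [6, 2, 2]>
+  // so the listed result latency of 6 is the N5 writeback counted from the
+  // last NPipe cycle, and both sources are read in cycle 2.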
+ // + // VLD1 + // FIXME: We don't model this instruction properly + InstrItinData<IIC_VLD1, [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>]>, + // + // VLD2 + // FIXME: We don't model this instruction properly + InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 2, 1]>, + // + // VLD3 + // FIXME: We don't model this instruction properly + InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 1]>, + // + // VLD4 + // FIXME: We don't model this instruction properly + InstrItinData<IIC_VLD4, [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 2, 1]>, + // + // VST + // FIXME: We don't model this instruction properly + InstrItinData<IIC_VST, [InstrStage<1, [A8_Issue], 0>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0], 0>, + InstrStage<1, [A8_NLSPipe]>]>, + // + // Double-register FP Unary + InstrItinData<IIC_VUNAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [5, 2]>, + // + // Quad-register FP Unary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VUNAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [6, 2]>, + // + // Double-register FP Binary + InstrItinData<IIC_VBIND, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [5, 2, 2]>, + // + // Quad-register FP Binary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VBINQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [6, 2, 2]>, + // + // Move Immediate + InstrItinData<IIC_VMOVImm, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [3]>, + // + // Double-register Permute Move + InstrItinData<IIC_VMOVD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>], [2, 1]>, + // + // Quad-register Permute Move + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 3 for those cases + InstrItinData<IIC_VMOVQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe]>], [3, 1]>, + // + // Integer to Single-precision Move + InstrItinData<IIC_VMOVIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>], [2, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_VMOVID , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_VMOVSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>], [20, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_VMOVDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>], [20, 20, 1]>, + // + // Integer to Lane Move + InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>, + // + // Double-register Permute + InstrItinData<IIC_VPERMD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>], [2, 2, 1, 1]>, + // + // Quad-register Permute + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 3 for those cases + 
InstrItinData<IIC_VPERMQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe]>], [3, 3, 1, 1]>, + // + // Quad-register Permute (3 cycle issue) + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>, + InstrStage<1, [A8_NPipe], 0>, + InstrStage<2, [A8_NLSPipe]>], [4, 4, 1, 1]>, + // + // Double-register FP Multiple-Accumulate + InstrItinData<IIC_VMACD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>, + // + // Quad-register FP Multiple-Accumulate + // Result written in N9, but that is relative to the last cycle of multicycle, + // so we use 10 for those cases + InstrItinData<IIC_VMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>, + // + // Double-register Reciprical Step + InstrItinData<IIC_VRECSD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [9, 2, 2]>, + // + // Quad-register Reciprical Step + InstrItinData<IIC_VRECSQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [10, 2, 2]>, + // + // Double-register Integer Count + InstrItinData<IIC_VCNTiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [3, 2, 2]>, + // + // Quad-register Integer Count + // Result written in N3, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [4, 2, 2]>, + // + // Double-register Integer Unary + InstrItinData<IIC_VUNAiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [4, 2]>, + // + // Quad-register Integer Unary + InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [4, 2]>, + // + // Double-register Integer Q-Unary + InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [4, 1]>, + // + // Quad-register Integer CountQ-Unary + InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [4, 1]>, + // + // Double-register Integer Binary + InstrItinData<IIC_VBINiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [3, 2, 2]>, + // + // Quad-register Integer Binary + InstrItinData<IIC_VBINiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [3, 2, 2]>, + // + // Double-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, + // + // Quad-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, + + // + // Double-register Integer Subtract + InstrItinData<IIC_VSUBiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [3, 2, 1]>, + // + // Quad-register Integer Subtract + InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [3, 2, 1]>, + // + // Double-register Integer Subtract + InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, + // + // Quad-register Integer Subtract + InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, + // + // Double-register Integer Shift + InstrItinData<IIC_VSHLiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [3, 1, 1]>, + // + // 
Quad-register Integer Shift + InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [4, 1, 1]>, + // + // Double-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [4, 1, 1]>, + // + // Quad-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [5, 1, 1]>, + // + // Double-register Integer Pair Add Long + InstrItinData<IIC_VPALiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [6, 3, 1]>, + // + // Quad-register Integer Pair Add Long + InstrItinData<IIC_VPALiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [7, 3, 1]>, + // + // Double-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [6, 3, 2, 1]>, + // + // Quad-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [6, 3, 2, 1]>, + + // + // Double-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [6, 2, 2]>, + // + // Double-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [7, 2, 1]>, + // + // Quad-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [7, 2, 2]>, + // + // Quad-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<3, [A8_NPipe]>], [9, 2, 1]>, + // + // Double-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>], [6, 3, 2, 2]>, + // + // Double-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [7, 3, 2, 1]>, + // + // Quad-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NPipe]>], [7, 3, 2, 2]>, + // + // Quad-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NPipe]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<3, [A8_NPipe]>], [9, 3, 2, 1]>, + // + // Double-register VEXT + InstrItinData<IIC_VEXTD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>, + // + // Quad-register VEXT + InstrItinData<IIC_VEXTQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>, + // + // VTB + InstrItinData<IIC_VTB1, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe]>], [3, 2, 1]>, + InstrItinData<IIC_VTB2, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe]>], [3, 2, 2, 1]>, + InstrItinData<IIC_VTB3, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>, + InstrStage<1, [A8_NPipe], 0>, + InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 1]>, + InstrItinData<IIC_VTB4, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_NLSPipe]>, + InstrStage<1, [A8_NPipe], 0>, + InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 3, 1]>, + // + // VTBX + InstrItinData<IIC_VTBX1, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe]>], [3, 1, 
2, 1]>,
+  InstrItinData<IIC_VTBX2, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                            InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                            InstrStage<1, [A8_NLSPipe]>,
+                            InstrStage<1, [A8_NPipe], 0>,
+                            InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                            InstrStage<1, [A8_NLSPipe]>,
+                            InstrStage<1, [A8_NPipe], 0>,
+                            InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
+]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
new file mode 100644
index 0000000..75320d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
@@ -0,0 +1,749 @@
+//=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A9 processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Ad-hoc scheduling information derived from the pretty vague "Cortex-A9
+// Technical Reference Manual".
+//
+// Functional units
+def A9_Issue : FuncUnit; // issue
+def A9_Pipe0 : FuncUnit; // pipeline 0
+def A9_Pipe1 : FuncUnit; // pipeline 1
+def A9_LSPipe : FuncUnit; // LS pipe
+def A9_NPipe : FuncUnit; // NEON ALU/MUL pipe
+def A9_DRegsVFP: FuncUnit; // FP register set, VFP side
+def A9_DRegsN : FuncUnit; // FP register set, NEON side
+
+// Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1
+//
+def CortexA9Itineraries : ProcessorItineraries<
+  [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1, A9_Issue], [
+  // VFP and NEON share the same register file. This means that every VFP
+  // instruction should wait for full completion of the preceding NEON
+  // instruction and vice-versa. We model this behavior with two artificial FUs:
+  // DRegsVFP and DRegsN.
+  //
+  // Every VFP instruction:
+  //  - Acquires DRegsVFP resource for 1 cycle
+  //  - Reserves DRegsN resource for the whole duration (including time to
+  //    register file writeback!).
+  // Every NEON instruction does the same but with FUs swapped.
+  //
+  // Since the reserved FU cannot be acquired, this models "cross-domain"
+  // stalls precisely.
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit.
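+  //
+  // Illustrative reading of the scheme above (inferred from the entries
+  // below, not from the TRM): a single-precision VFP operation is written as
+  //   InstrItinData<IIC_fpUNA32, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  //                               InstrStage<3, [A9_DRegsN], 0, Reserved>,
+  //                               InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
+  //                               InstrStage<1, [A9_NPipe]>], [1, 1]>
+  // so it acquires the VFP-side register-file FU for one cycle but keeps the
+  // NEON-side FU reserved for three cycles (execution plus the 2-cycle
+  // writeback); a NEON instruction issued in that window stalls on A9_DRegsN.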
+ + // FP Special Register to Integer Register File Move + InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>]>, + // + // Single-precision FP Unary + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [1, 1]>, + // + // Double-precision FP Unary + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [1, 1]>, + + // + // Single-precision FP Compare + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 4 cycles + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [1, 1]>, + // + // Double-precision FP Compare + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 4 cycles + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [1, 1]>, + // + // Single to Double FP Convert + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1]>, + // + // Double to Single FP Convert + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1]>, + + // + // Single to Half FP Convert + InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1]>, + // + // Half to Single FP Convert + InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [2, 1]>, + + // + // Single-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1]>, + // + // Double-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1]>, + // + // Integer to Single-Precision FP Convert + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1]>, + // + // Integer to Double-Precision FP Convert + InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1]>, + // + // Single-precision FP ALU + InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + // + // Double-precision FP ALU + 
InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + // + // Single-precision FP Multiply + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<6, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [5, 1, 1]>, + // + // Double-precision FP Multiply + InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<7, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [6, 1, 1]>, + // + // Single-precision FP MAC + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<9, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [8, 0, 1, 1]>, + // + // Double-precision FP MAC + InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<10, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [9, 0, 1, 1]>, + // + // Single-precision FP DIV + InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<16, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<10, [A9_NPipe]>], [15, 1, 1]>, + // + // Double-precision FP DIV + InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<26, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<20, [A9_NPipe]>], [25, 1, 1]>, + // + // Single-precision FP SQRT + InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<18, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<13, [A9_NPipe]>], [17, 1]>, + // + // Double-precision FP SQRT + InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<33, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<28, [A9_NPipe]>], [32, 1]>, + + // + // Integer to Single-precision Move + InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra 1 latency cycle since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [1, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra 1 latency cycle since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [1, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, + // + // Single-precision FP Load + // use A9_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>]>, + // + // Double-precision FP Load + // use A9_Issue to enforce the 1 
load/store per cycle limit + InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>]>, + // + // FP Load Multiple + // use A9_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpLoadm, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>]>, + // + // Single-precision FP Store + // use A9_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>]>, + // + // Double-precision FP Store + // use A9_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>]>, + // + // FP Store Multiple + // use A9_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>]>, + // NEON + // Issue through integer pipeline, and execute in NEON unit. + // FIXME: Neon pipeline and LdSt unit are multiplexed. + // Add some syntactic sugar to model this! 
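+  //
+  // Until then, the multiplexing is only approximated (an inference from the
+  // entries below, not TRM data): each NEON load/store reserves A9_Issue, one
+  // integer pipe, A9_LSPipe and A9_NPipe for the same issue slot, e.g.
+  //   InstrItinData<IIC_VLD2, [InstrStage<1, [A9_DRegsN], 0, Required>,
+  //                            InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+  //                            InstrStage<1, [A9_Issue], 0>,
+  //                            InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
+  //                            InstrStage<1, [A9_LSPipe], 0>,
+  //                            InstrStage<1, [A9_NPipe]>], [2, 2, 1]>
+  // which serializes NEON memory operations against both the load/store and
+  // NEON pipes instead of modelling the shared unit directly.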
+ // VLD1 + // FIXME: We don't model this instruction properly + InstrItinData<IIC_VLD1, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>]>, + // + // VLD2 + // FIXME: We don't model this instruction properly + InstrItinData<IIC_VLD2, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, + // + // VLD3 + // FIXME: We don't model this instruction properly + InstrItinData<IIC_VLD3, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>], [2, 2, 2, 1]>, + // + // VLD4 + // FIXME: We don't model this instruction properly + InstrItinData<IIC_VLD4, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>], [2, 2, 2, 2, 1]>, + // + // VST + // FIXME: We don't model this instruction properly + InstrItinData<IIC_VST, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Issue], 0>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_NPipe]>]>, + // + // Double-register Integer Unary + InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 2]>, + // + // Quad-register Integer Unary + InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 2]>, + // + // Double-register Integer Q-Unary + InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1]>, + // + // Quad-register Integer CountQ-Unary + InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1]>, + // + // Double-register Integer Binary + InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, + // + // Quad-register Integer Binary + InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, + // + // Double-register Integer 
Subtract + InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, + // + // Quad-register Integer Subtract + InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, + // + // Double-register Integer Shift + InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, + // + // Quad-register Integer Shift + InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, + // + // Double-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + // + // Quad-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + // + // Double-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, + // + // Quad-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, + // + // Double-register Integer Subtract (4 cycle) + InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, + // + // Quad-register Integer Subtract (4 cycle) + InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, + + // + // Double-register Integer Count + InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, + // + // Quad-register Integer Count + // Result written in N3, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, 
[A9_NPipe]>], [4, 2, 2]>, + // + // Double-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>, + // + // Quad-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, + // + // Double-register Integer Pair Add Long + InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [6, 3, 1]>, + // + // Quad-register Integer Pair Add Long + InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [6, 3, 1]>, + + // + // Double-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [6, 2, 2]>, + // + // Quad-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [7, 2, 2]>, + + // + // Double-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [7, 2, 1]>, + // + // Quad-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<4, [A9_NPipe]>], [9, 2, 1]>, + // + // Double-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [6, 3, 2, 2]>, + // + // Double-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [7, 3, 2, 1]>, + // + // Quad-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [7, 3, 2, 2]>, + // + // Quad-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, 
[A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<4, [A9_NPipe]>], [9, 3, 2, 1]>, + // + // Move Immediate + InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [3]>, + // + // Double-register Permute Move + InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // FIXME: all latencies are arbitrary, no information is available + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [2, 1]>, + // + // Quad-register Permute Move + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 3 for those cases + InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // FIXME: all latencies are arbitrary, no information is available + InstrStage<4, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [3, 1]>, + // + // Integer to Single-precision Move + InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_DRegsN], 0, Required>, + // FIXME: all latencies are arbitrary, no information is available + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [2, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_DRegsN], 0, Required>, + // FIXME: all latencies are arbitrary, no information is available + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_DRegsN], 0, Required>, + // FIXME: all latencies are arbitrary, no information is available + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [2, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_DRegsN], 0, Required>, + // FIXME: all latencies are arbitrary, no information is available + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, + // + // Integer to Lane Move + InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN], 0, Required>, + // FIXME: all latencies are arbitrary, no information is available + InstrStage<4, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [3, 1, 1]>, + + // + // Double-register FP Unary + InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [5, 2]>, + // + // Quad-register FP Unary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [6, 2]>, + // + // Double-register FP Binary + // FIXME: We're using this itin for many instructions and [2, 2] here is too + // optimistic. 
+ InstrItinData<IIC_VBIND, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [5, 2, 2]>, + // + // Quad-register FP Binary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + // FIXME: We're using this itin for many instructions and [2, 2] here is too + // optimistic. + InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, + // + // Double-register FP Multiple-Accumulate + InstrItinData<IIC_VMACD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, + // + // Quad-register FP Multiple-Accumulate + // Result written in N9, but that is relative to the last cycle of multicycle, + // so we use 10 for those cases + InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>, + // + // Double-register Reciprical Step + InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, + // + // Quad-register Reciprical Step + InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<4, [A9_NPipe]>], [8, 2, 2]>, + // + // Double-register Permute + InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [2, 2, 1, 1]>, + // + // Quad-register Permute + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 3 for those cases + InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [3, 3, 1, 1]>, + // + // Quad-register Permute (3 cycle issue) + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<3, [A9_LSPipe]>], [4, 4, 1, 1]>, + + // + // Double-register VEXT + InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, + // + // Quad-register VEXT + InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<8, [A9_DRegsVFP], 
0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [3, 1, 1]>, + // + // VTB + InstrItinData<IIC_VTB1, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [3, 2, 1]>, + InstrItinData<IIC_VTB2, [InstrStage<2, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [3, 2, 2, 1]>, + InstrItinData<IIC_VTB3, [InstrStage<2, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 1]>, + InstrItinData<IIC_VTB4, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 3, 1]>, + // + // VTBX + InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [3, 1, 2, 1]>, + InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [3, 1, 2, 2, 1]>, + InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 1]>, + InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> +]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td new file mode 100644 index 0000000..f813022 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td @@ -0,0 +1,204 @@ +//===- ARMScheduleV6.td - ARM v6 Scheduling Definitions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM v6 processors. +// +//===----------------------------------------------------------------------===// + +// Model based on ARM1176 +// +// Functional Units +def V6_Pipe : FuncUnit; // pipeline + +// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual". 
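+//
+// With a single pipeline, the model below reduces to issue occupancy and
+// operand latencies. An illustrative reading (inferred from the entries, not
+// quoted from the TRM):
+//   InstrItinData<IIC_iALUr, [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>
+// holds V6_Pipe for one cycle and makes the result available two cycles after
+// issue, with both sources read in cycle 2; slower forms simply occupy
+// V6_Pipe longer (e.g. InstrStage<2, ...> for the shifted-register variants).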
+// +def ARMV6Itineraries : ProcessorItineraries< + [V6_Pipe], [ + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [V6_Pipe]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iALUr , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>, + // + // Unary Instructions that produce a result + InstrItinData<IIC_iUNAr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iUNAsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iUNAsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iCMPsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iCMPsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iMOVsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iMOVsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + // + // Move instructions, conditional + InstrItinData<IIC_iCMOVi , [InstrStage<1, [V6_Pipe]>], [3]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [V6_Pipe]>], [3, 2]>, + InstrItinData<IIC_iCMOVsi , [InstrStage<1, [V6_Pipe]>], [3, 1]>, + InstrItinData<IIC_iCMOVsr , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + + // Integer multiply pipeline + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, [V6_Pipe]>], [4, 1, 1, 2]>, + InstrItinData<IIC_iMUL32 , [InstrStage<2, [V6_Pipe]>], [5, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<2, [V6_Pipe]>], [5, 1, 1, 2]>, + InstrItinData<IIC_iMUL64 , [InstrStage<3, [V6_Pipe]>], [6, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<3, [V6_Pipe]>], [6, 1, 1, 2]>, + + // Integer load pipeline + // + // Immediate offset + InstrItinData<IIC_iLoadi , [InstrStage<1, [V6_Pipe]>], [4, 1]>, + // + // Register offset + InstrItinData<IIC_iLoadr , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iLoadsi , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoadiu , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iLoadru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iLoadsiu , [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>, + + // + // Load multiple + InstrItinData<IIC_iLoadm , [InstrStage<3, [V6_Pipe]>]>, + + // Integer store pipeline + // + // Immediate offset + InstrItinData<IIC_iStorei , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + // + // Register offset + InstrItinData<IIC_iStorer , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, + + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iStoresi , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iStoreiu , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iStoreru , [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iStoresiu, 
[InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>, + // + // Store multiple + InstrItinData<IIC_iStorem , [InstrStage<3, [V6_Pipe]>]>, + + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData<IIC_Br , [InstrStage<1, [V6_Pipe]>]>, + + // VFP + // Issue through integer pipeline, and execute in NEON unit. We assume + // RunFast mode so that NFP pipeline is used for single-precision when + // possible. + // + // FP Special Register to Integer Register File Move + InstrItinData<IIC_fpSTAT , [InstrStage<1, [V6_Pipe]>], [3]>, + // + // Single-precision FP Unary + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Double-precision FP Unary + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Single-precision FP Compare + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + // + // Double-precision FP Compare + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + // + // Single to Double FP Convert + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Double to Single FP Convert + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Single-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Double-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Integer to Single-Precision FP Convert + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Integer to Double-Precision FP Convert + InstrItinData<IIC_fpCVTID , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Single-precision FP ALU + InstrItinData<IIC_fpALU32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>, + // + // Double-precision FP ALU + InstrItinData<IIC_fpALU64 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>, + // + // Single-precision FP Multiply + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>, + // + // Double-precision FP Multiply + InstrItinData<IIC_fpMUL64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2]>, + // + // Single-precision FP MAC + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>, + // + // Double-precision FP MAC + InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>, + // + // Single-precision FP DIV + InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>, + // + // Double-precision FP DIV + InstrItinData<IIC_fpDIV64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>, + // + // Single-precision FP SQRT + InstrItinData<IIC_fpSQRT32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>, + // + // Double-precision FP SQRT + InstrItinData<IIC_fpSQRT64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>, + // + // Single-precision FP Load + InstrItinData<IIC_fpLoad32 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>, + // + // Double-precision FP Load + InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>, + // + // FP Load Multiple + InstrItinData<IIC_fpLoadm , [InstrStage<3, [V6_Pipe]>]>, + // + // Single-precision FP Store + InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + // + // Double-precision FP Store + // use FU_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + // + // FP Store Multiple + InstrItinData<IIC_fpStorem , [InstrStage<3, [V6_Pipe]>]> +]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp new file mode 100644 index 0000000..a289407 --- 
/dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -0,0 +1,134 @@ +//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARMSelectionDAGInfo class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-selectiondag-info" +#include "ARMTargetMachine.h" +using namespace llvm; + +ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM) + : TargetSelectionDAGInfo(TM), + Subtarget(&TM.getSubtarget<ARMSubtarget>()) { +} + +ARMSelectionDAGInfo::~ARMSelectionDAGInfo() { +} + +SDValue +ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + const Value *DstSV, + uint64_t DstSVOff, + const Value *SrcSV, + uint64_t SrcSVOff) const { + // Do repeated 4-byte loads and stores. To be improved. + // This requires 4-byte alignment. + if ((Align & 3) != 0) + return SDValue(); + // This requires the copy size to be a constant, preferrably + // within a subtarget-specific limit. + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (!ConstantSize) + return SDValue(); + uint64_t SizeVal = ConstantSize->getZExtValue(); + if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold()) + return SDValue(); + + unsigned BytesLeft = SizeVal & 3; + unsigned NumMemOps = SizeVal >> 2; + unsigned EmittedNumMemOps = 0; + EVT VT = MVT::i32; + unsigned VTSize = 4; + unsigned i = 0; + const unsigned MAX_LOADS_IN_LDM = 6; + SDValue TFOps[MAX_LOADS_IN_LDM]; + SDValue Loads[MAX_LOADS_IN_LDM]; + uint64_t SrcOff = 0, DstOff = 0; + + // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the + // same number of stores. The loads and stores will get combined into + // ldm/stm later on. + while (EmittedNumMemOps < NumMemOps) { + for (i = 0; + i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { + Loads[i] = DAG.getLoad(VT, dl, Chain, + DAG.getNode(ISD::ADD, dl, MVT::i32, Src, + DAG.getConstant(SrcOff, MVT::i32)), + SrcSV, SrcSVOff + SrcOff, isVolatile, false, 0); + TFOps[i] = Loads[i].getValue(1); + SrcOff += VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); + + for (i = 0; + i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { + TFOps[i] = DAG.getStore(Chain, dl, Loads[i], + DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, + DAG.getConstant(DstOff, MVT::i32)), + DstSV, DstSVOff + DstOff, isVolatile, false, 0); + DstOff += VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); + + EmittedNumMemOps += i; + } + + if (BytesLeft == 0) + return Chain; + + // Issue loads / stores for the trailing (1 - 3) bytes. 
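+  // For example (given 4-byte alignment): an 8-byte copy is lowered to two
+  // i32 load/store pairs by the loop above, while a 7-byte copy gets one i32
+  // pair there and then, below, an i16 pair followed by an i8 pair for the
+  // remaining three bytes.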
+ unsigned BytesLeftSave = BytesLeft; + i = 0; + while (BytesLeft) { + if (BytesLeft >= 2) { + VT = MVT::i16; + VTSize = 2; + } else { + VT = MVT::i8; + VTSize = 1; + } + + Loads[i] = DAG.getLoad(VT, dl, Chain, + DAG.getNode(ISD::ADD, dl, MVT::i32, Src, + DAG.getConstant(SrcOff, MVT::i32)), + SrcSV, SrcSVOff + SrcOff, false, false, 0); + TFOps[i] = Loads[i].getValue(1); + ++i; + SrcOff += VTSize; + BytesLeft -= VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); + + i = 0; + BytesLeft = BytesLeftSave; + while (BytesLeft) { + if (BytesLeft >= 2) { + VT = MVT::i16; + VTSize = 2; + } else { + VT = MVT::i8; + VTSize = 1; + } + + TFOps[i] = DAG.getStore(Chain, dl, Loads[i], + DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, + DAG.getConstant(DstOff, MVT::i32)), + DstSV, DstSVOff + DstOff, false, false, 0); + ++i; + DstOff += VTSize; + BytesLeft -= VTSize; + } + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h new file mode 100644 index 0000000..d7d00c2 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -0,0 +1,44 @@ +//===-- ARMSelectionDAGInfo.h - ARM SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the ARM subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMSELECTIONDAGINFO_H +#define ARMSELECTIONDAGINFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +class ARMSelectionDAGInfo : public TargetSelectionDAGInfo { + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const ARMSubtarget *Subtarget; + +public: + explicit ARMSelectionDAGInfo(const TargetMachine &TM); + ~ARMSelectionDAGInfo(); + + virtual + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + const Value *DstSV, + uint64_t DstSVOff, + const Value *SrcSV, + uint64_t SrcSVOff) const; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp new file mode 100644 index 0000000..10fd257 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -0,0 +1,180 @@ +//===-- ARMSubtarget.cpp - ARM Subtarget Information ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific subclass of TargetSubtarget. 
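+// (For example, a target triple beginning with "armv7a" selects V7A below,
+// while one beginning with "thumbv6t2" selects V6T2 and forces Thumb mode.)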
+// +//===----------------------------------------------------------------------===// + +#include "ARMSubtarget.h" +#include "ARMGenSubtarget.inc" +#include "llvm/GlobalValue.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/ADT/SmallVector.h" +using namespace llvm; + +static cl::opt<bool> +ReserveR9("arm-reserve-r9", cl::Hidden, + cl::desc("Reserve R9, making it unavailable as GPR")); + +static cl::opt<bool> +UseMOVT("arm-use-movt", + cl::init(true), cl::Hidden); + +ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, + bool isT) + : ARMArchVersion(V4) + , ARMFPUType(None) + , UseNEONForSinglePrecisionFP(false) + , SlowVMLx(false) + , IsThumb(isT) + , ThumbMode(Thumb1) + , PostRAScheduler(false) + , IsR9Reserved(ReserveR9) + , UseMovt(UseMOVT) + , HasFP16(false) + , HasHardwareDivide(false) + , HasT2ExtractPack(false) + , stackAlignment(4) + , CPUString("generic") + , TargetType(isELF) // Default to ELF unless otherwise specified. + , TargetABI(ARM_ABI_APCS) { + // default to soft float ABI + if (FloatABIType == FloatABI::Default) + FloatABIType = FloatABI::Soft; + + // Determine default and user specified characteristics + + // Parse features string. + CPUString = ParseSubtargetFeatures(FS, CPUString); + + // When no arch is specified either by CPU or by attributes, make the default + // ARMv4T. + if (CPUString == "generic" && (FS.empty() || FS == "generic")) + ARMArchVersion = V4T; + + // Set the boolean corresponding to the current target triple, or the default + // if one cannot be determined, to true. + unsigned Len = TT.length(); + unsigned Idx = 0; + + if (Len >= 5 && TT.substr(0, 4) == "armv") + Idx = 4; + else if (Len >= 6 && TT.substr(0, 5) == "thumb") { + IsThumb = true; + if (Len >= 7 && TT[5] == 'v') + Idx = 6; + } + if (Idx) { + unsigned SubVer = TT[Idx]; + if (SubVer >= '7' && SubVer <= '9') { + ARMArchVersion = V7A; + if (Len >= Idx+2 && TT[Idx+1] == 'm') + ARMArchVersion = V7M; + } else if (SubVer == '6') { + ARMArchVersion = V6; + if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2') + ARMArchVersion = V6T2; + } else if (SubVer == '5') { + ARMArchVersion = V5T; + if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e') + ARMArchVersion = V5TE; + } else if (SubVer == '4') { + if (Len >= Idx+2 && TT[Idx+1] == 't') + ARMArchVersion = V4T; + else + ARMArchVersion = V4; + } + } + + // Thumb2 implies at least V6T2. + if (ARMArchVersion >= V6T2) + ThumbMode = Thumb2; + else if (ThumbMode >= Thumb2) + ARMArchVersion = V6T2; + + if (Len >= 10) { + if (TT.find("-darwin") != std::string::npos) + // arm-darwin + TargetType = isDarwin; + } + + if (TT.find("eabi") != std::string::npos) + TargetABI = ARM_ABI_AAPCS; + + if (isAAPCS_ABI()) + stackAlignment = 8; + + if (isTargetDarwin()) + IsR9Reserved = ReserveR9 | (ARMArchVersion < V6); + + if (!isThumb() || hasThumb2()) + PostRAScheduler = true; +} + +/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. +bool +ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, + Reloc::Model RelocM) const { + if (RelocM == Reloc::Static) + return false; + + // Materializable GVs (in JIT lazy compilation mode) do not require an extra + // load from stub. + bool isDecl = GV->isDeclaration() && !GV->isMaterializable(); + + if (!isTargetDarwin()) { + // Extra load is needed for all externally visible. 
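+    // Outside Darwin, a symbol that is neither local nor hidden may be
+    // preempted at link/load time, so it is referenced indirectly (typically
+    // through a GOT entry); local or hidden symbols can be reached directly.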
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility()) + return false; + return true; + } else { + if (RelocM == Reloc::PIC_) { + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (!isDecl && !GV->isWeakForLinker()) + return false; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return true; + + // If symbol visibility is hidden, we have a stub for common symbol + // references and external declarations. + if (isDecl || GV->hasCommonLinkage()) + // Hidden $non_lazy_ptr reference. + return true; + + return false; + } else { + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (!isDecl && !GV->isWeakForLinker()) + return false; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return true; + } + } + + return false; +} + +bool ARMSubtarget::enablePostRAScheduler( + CodeGenOpt::Level OptLevel, + TargetSubtarget::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const { + Mode = TargetSubtarget::ANTIDEP_CRITICAL; + CriticalPathRCs.clear(); + CriticalPathRCs.push_back(&ARM::GPRRegClass); + return PostRAScheduler && OptLevel >= CodeGenOpt::Default; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h new file mode 100644 index 0000000..8332bba --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h @@ -0,0 +1,176 @@ +//=====---- ARMSubtarget.h - Define Subtarget for the ARM -----*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the ARM specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMSUBTARGET_H +#define ARMSUBTARGET_H + +#include "llvm/Target/TargetInstrItineraries.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtarget.h" +#include "ARMBaseRegisterInfo.h" +#include <string> + +namespace llvm { +class GlobalValue; + +class ARMSubtarget : public TargetSubtarget { +protected: + enum ARMArchEnum { + V4, V4T, V5T, V5TE, V6, V6T2, V7A, V7M + }; + + enum ARMFPEnum { + None, VFPv2, VFPv3, NEON + }; + + enum ThumbTypeEnum { + Thumb1, + Thumb2 + }; + + /// ARMArchVersion - ARM architecture version: V4, V4T (base), V5T, V5TE, + /// V6, V6T2, V7A, V7M. + ARMArchEnum ARMArchVersion; + + /// ARMFPUType - Floating Point Unit type. + ARMFPEnum ARMFPUType; + + /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been + /// specified. Use the method useNEONForSinglePrecisionFP() to + /// determine if NEON should actually be used. + bool UseNEONForSinglePrecisionFP; + + /// SlowVMLx - If the VFP2 instructions are available, indicates whether + /// the VML[AS] instructions are slow (if so, don't use them). + bool SlowVMLx; + + /// IsThumb - True if we are in thumb mode, false if in ARM mode. + bool IsThumb; + + /// ThumbMode - Indicates supported Thumb version. 
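+  /// Thumb1 for pre-v6t2 targets; Thumb2 once the 32-bit Thumb-2 encodings
+  /// are available (v6t2 and later, see ARMSubtarget.cpp).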
+ ThumbTypeEnum ThumbMode; + + /// PostRAScheduler - True if using post-register-allocation scheduler. + bool PostRAScheduler; + + /// IsR9Reserved - True if R9 is a not available as general purpose register. + bool IsR9Reserved; + + /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit + /// imms (including global addresses). + bool UseMovt; + + /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF + /// only so far) + bool HasFP16; + + /// HasHardwareDivide - True if subtarget supports [su]div + bool HasHardwareDivide; + + /// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack + /// instructions. + bool HasT2ExtractPack; + + /// stackAlignment - The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned stackAlignment; + + /// CPUString - String name of used CPU. + std::string CPUString; + + /// Selected instruction itineraries (one entry per itinerary class.) + InstrItineraryData InstrItins; + + public: + enum { + isELF, isDarwin + } TargetType; + + enum { + ARM_ABI_APCS, + ARM_ABI_AAPCS // ARM EABI + } TargetABI; + + /// This constructor initializes the data members to match that + /// of the specified triple. + /// + ARMSubtarget(const std::string &TT, const std::string &FS, bool isThumb); + + /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { + // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb1. + // Change this once Thumb1 ldmia / stmia support is added. + return isThumb1Only() ? 0 : 64; + } + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + std::string ParseSubtargetFeatures(const std::string &FS, + const std::string &CPU); + + bool hasV4TOps() const { return ARMArchVersion >= V4T; } + bool hasV5TOps() const { return ARMArchVersion >= V5T; } + bool hasV5TEOps() const { return ARMArchVersion >= V5TE; } + bool hasV6Ops() const { return ARMArchVersion >= V6; } + bool hasV6T2Ops() const { return ARMArchVersion >= V6T2; } + bool hasV7Ops() const { return ARMArchVersion >= V7A; } + + bool hasVFP2() const { return ARMFPUType >= VFPv2; } + bool hasVFP3() const { return ARMFPUType >= VFPv3; } + bool hasNEON() const { return ARMFPUType >= NEON; } + bool useNEONForSinglePrecisionFP() const { + return hasNEON() && UseNEONForSinglePrecisionFP; } + bool hasDivide() const { return HasHardwareDivide; } + bool hasT2ExtractPack() const { return HasT2ExtractPack; } + bool useVMLx() const {return hasVFP2() && !SlowVMLx; } + + bool hasFP16() const { return HasFP16; } + + bool isTargetDarwin() const { return TargetType == isDarwin; } + bool isTargetELF() const { return TargetType == isELF; } + + bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; } + bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; } + + bool isThumb() const { return IsThumb; } + bool isThumb1Only() const { return IsThumb && (ThumbMode == Thumb1); } + bool isThumb2() const { return IsThumb && (ThumbMode == Thumb2); } + bool hasThumb2() const { return ThumbMode >= Thumb2; } + + bool isR9Reserved() const { return IsR9Reserved; } + + bool useMovt() const { return UseMovt && hasV6T2Ops(); } + + const std::string & getCPUString() const { return CPUString; } + + /// enablePostRAScheduler - True at 'More' optimization. 
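+  /// Concretely (see ARMSubtarget.cpp): returns true only when the subtarget
+  /// enables post-RA scheduling and OptLevel is CodeGenOpt::Default or
+  /// better, and requests critical-path anti-dependency breaking on GPRs.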
+ bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, + TargetSubtarget::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const; + + /// getInstrItins - Return the instruction itineraies based on subtarget + /// selection. + const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + + /// getStackAlignment - Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return stackAlignment; } + + /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect + /// symbol. + bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; +}; +} // End llvm namespace + +#endif // ARMSUBTARGET_H diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp new file mode 100644 index 0000000..b4a9252 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -0,0 +1,148 @@ +//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "ARMTargetMachine.h" +#include "ARMMCAsmInfo.h" +#include "ARMFrameInfo.h" +#include "ARM.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegistry.h" +using namespace llvm; + +static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { + Triple TheTriple(TT); + switch (TheTriple.getOS()) { + case Triple::Darwin: + return new ARMMCAsmInfoDarwin(); + default: + return new ARMELFMCAsmInfo(); + } +} + + +extern "C" void LLVMInitializeARMTarget() { + // Register the target. + RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget); + RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget); + + // Register the target asm info. + RegisterAsmInfoFn A(TheARMTarget, createMCAsmInfo); + RegisterAsmInfoFn B(TheThumbTarget, createMCAsmInfo); +} + +/// TargetMachine ctor - Create an ARM architecture model. +/// +ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, + const std::string &TT, + const std::string &FS, + bool isThumb) + : LLVMTargetMachine(T, TT), + Subtarget(TT, FS, isThumb), + FrameInfo(Subtarget), + JITInfo(), + InstrItins(Subtarget.getInstrItineraryData()) { + DefRelocModel = getRelocationModel(); +} + +ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT, + const std::string &FS) + : ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget), + DataLayout(Subtarget.isAPCS_ABI() ? + std::string("e-p:32:32-f64:32:32-i64:32:32-n32") : + std::string("e-p:32:32-f64:64:64-i64:64:64-n32")), + TLInfo(*this), + TSInfo(*this) { +} + +ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT, + const std::string &FS) + : ARMBaseTargetMachine(T, TT, FS, true), + InstrInfo(Subtarget.hasThumb2() + ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget)) + : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), + DataLayout(Subtarget.isAPCS_ABI() ? 
+ std::string("e-p:32:32-f64:32:32-i64:32:32-" + "i16:16:32-i8:8:32-i1:8:32-a:0:32-n32") : + std::string("e-p:32:32-f64:64:64-i64:64:64-" + "i16:16:32-i8:8:32-i1:8:32-a:0:32-n32")), + TLInfo(*this), + TSInfo(*this) { +} + + + +// Pass Pipeline Configuration +bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + PM.add(createARMISelDag(*this, OptLevel)); + return false; +} + +bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (Subtarget.hasNEON()) + PM.add(createNEONPreAllocPass()); + + // FIXME: temporarily disabling load / store optimization pass for Thumb1. + if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) + PM.add(createARMLoadStoreOptimizationPass(true)); + return true; +} + +bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + // FIXME: temporarily disabling load / store optimization pass for Thumb1. + if (OptLevel != CodeGenOpt::None) { + if (!Subtarget.isThumb1Only()) + PM.add(createARMLoadStoreOptimizationPass()); + if (Subtarget.hasNEON()) + PM.add(createNEONMoveFixPass()); + } + + // Expand some pseudo instructions into multiple instructions to allow + // proper scheduling. + PM.add(createARMExpandPseudoPass()); + + return true; +} + +bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + // FIXME: temporarily disabling load / store optimization pass for Thumb1. + if (OptLevel != CodeGenOpt::None) { + if (!Subtarget.isThumb1Only()) + PM.add(createIfConverterPass()); + } + + if (Subtarget.isThumb2()) { + PM.add(createThumb2ITBlockPass()); + PM.add(createThumb2SizeReductionPass()); + } + + PM.add(createARMConstantIslandPass()); + return true; +} + +bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM, + CodeGenOpt::Level OptLevel, + JITCodeEmitter &JCE) { + // FIXME: Move this to TargetJITInfo! + if (DefRelocModel == Reloc::Default) + setRelocationModel(Reloc::Static); + + // Machine code emitter pass for ARM. + PM.add(createARMJITCodeEmitterPass(*this, JCE)); + return false; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h new file mode 100644 index 0000000..a222e57 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -0,0 +1,124 @@ +//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the ARM specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMTARGETMACHINE_H +#define ARMTARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "ARMInstrInfo.h" +#include "ARMFrameInfo.h" +#include "ARMJITInfo.h" +#include "ARMSubtarget.h" +#include "ARMISelLowering.h" +#include "ARMSelectionDAGInfo.h" +#include "Thumb1InstrInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/ADT/OwningPtr.h" + +namespace llvm { + +class ARMBaseTargetMachine : public LLVMTargetMachine { +protected: + ARMSubtarget Subtarget; + +private: + ARMFrameInfo FrameInfo; + ARMJITInfo JITInfo; + InstrItineraryData InstrItins; + Reloc::Model DefRelocModel; // Reloc model before it's overridden. 
+ +public: + ARMBaseTargetMachine(const Target &T, const std::string &TT, + const std::string &FS, bool isThumb); + + virtual const ARMFrameInfo *getFrameInfo() const { return &FrameInfo; } + virtual ARMJITInfo *getJITInfo() { return &JITInfo; } + virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } + virtual const InstrItineraryData getInstrItineraryData() const { + return InstrItins; + } + + // Pass Pipeline Configuration + virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreSched2(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, + JITCodeEmitter &MCE); +}; + +/// ARMTargetMachine - ARM target machine. +/// +class ARMTargetMachine : public ARMBaseTargetMachine { + ARMInstrInfo InstrInfo; + const TargetData DataLayout; // Calculates type size & alignment + ARMTargetLowering TLInfo; + ARMSelectionDAGInfo TSInfo; +public: + ARMTargetMachine(const Target &T, const std::string &TT, + const std::string &FS); + + virtual const ARMRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + + virtual const ARMTargetLowering *getTargetLowering() const { + return &TLInfo; + } + + virtual const ARMSelectionDAGInfo* getSelectionDAGInfo() const { + return &TSInfo; + } + + virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const TargetData *getTargetData() const { return &DataLayout; } +}; + +/// ThumbTargetMachine - Thumb target machine. +/// Due to the way architectures are handled, this represents both +/// Thumb-1 and Thumb-2. +/// +class ThumbTargetMachine : public ARMBaseTargetMachine { + // Either Thumb1InstrInfo or Thumb2InstrInfo. + OwningPtr<ARMBaseInstrInfo> InstrInfo; + const TargetData DataLayout; // Calculates type size & alignment + ARMTargetLowering TLInfo; + ARMSelectionDAGInfo TSInfo; +public: + ThumbTargetMachine(const Target &T, const std::string &TT, + const std::string &FS); + + /// returns either Thumb1RegisterInfo or Thumb2RegisterInfo + virtual const ARMBaseRegisterInfo *getRegisterInfo() const { + return &InstrInfo->getRegisterInfo(); + } + + virtual const ARMTargetLowering *getTargetLowering() const { + return &TLInfo; + } + + virtual const ARMSelectionDAGInfo *getSelectionDAGInfo() const { + return &TSInfo; + } + + /// returns either Thumb1InstrInfo or Thumb2InstrInfo + virtual const ARMBaseInstrInfo *getInstrInfo() const { + return InstrInfo.get(); + } + virtual const TargetData *getTargetData() const { return &DataLayout; } +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp new file mode 100644 index 0000000..091a3b3 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -0,0 +1,39 @@ +//===-- llvm/Target/ARMTargetObjectFile.cpp - ARM Object Info Impl --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "ARMTargetObjectFile.h" +#include "ARMSubtarget.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; +using namespace dwarf; + +//===----------------------------------------------------------------------===// +// ELF Target +//===----------------------------------------------------------------------===// + +void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + + if (TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI()) { + StaticCtorSection = + getContext().getELFSection(".init_array", MCSectionELF::SHT_INIT_ARRAY, + MCSectionELF::SHF_WRITE | + MCSectionELF::SHF_ALLOC, + SectionKind::getDataRel()); + StaticDtorSection = + getContext().getELFSection(".fini_array", MCSectionELF::SHT_FINI_ARRAY, + MCSectionELF::SHF_WRITE | + MCSectionELF::SHF_ALLOC, + SectionKind::getDataRel()); + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h new file mode 100644 index 0000000..097fc2c --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h @@ -0,0 +1,29 @@ +//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_ARM_TARGETOBJECTFILE_H +#define LLVM_TARGET_ARM_TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +namespace llvm { + +class MCContext; +class TargetMachine; + +class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF { +public: + ARMElfTargetObjectFile() : TargetLoweringObjectFileELF() {} + + virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp new file mode 100644 index 0000000..f859d1b --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp @@ -0,0 +1,140 @@ +//===-- ARMAsmLexer.cpp - Tokenize ARM assembly to AsmTokens --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMTargetMachine.h" + +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" + +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" + +#include "llvm/Target/TargetAsmLexer.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegistry.h" + +#include <string> +#include <map> + +using namespace llvm; + +namespace { + + class ARMBaseAsmLexer : public TargetAsmLexer { + const MCAsmInfo &AsmInfo; + + const AsmToken &lexDefinite() { + return getLexer()->Lex(); + } + + AsmToken LexTokenUAL(); + protected: + typedef std::map <std::string, unsigned> rmap_ty; + + rmap_ty RegisterMap; + + void InitRegisterMap(const TargetRegisterInfo *info) { + unsigned numRegs = info->getNumRegs(); + + for (unsigned i = 0; i < numRegs; ++i) { + const char *regName = info->getName(i); + if (regName) + RegisterMap[regName] = i; + } + } + + unsigned MatchRegisterName(StringRef Name) { + rmap_ty::iterator iter = RegisterMap.find(Name.str()); + if (iter != RegisterMap.end()) + return iter->second; + else + return 0; + } + + AsmToken LexToken() { + if (!Lexer) { + SetError(SMLoc(), "No MCAsmLexer installed"); + return AsmToken(AsmToken::Error, "", 0); + } + + switch (AsmInfo.getAssemblerDialect()) { + default: + SetError(SMLoc(), "Unhandled dialect"); + return AsmToken(AsmToken::Error, "", 0); + case 0: + return LexTokenUAL(); + } + } + public: + ARMBaseAsmLexer(const Target &T, const MCAsmInfo &MAI) + : TargetAsmLexer(T), AsmInfo(MAI) { + } + }; + + class ARMAsmLexer : public ARMBaseAsmLexer { + public: + ARMAsmLexer(const Target &T, const MCAsmInfo &MAI) + : ARMBaseAsmLexer(T, MAI) { + std::string tripleString("arm-unknown-unknown"); + std::string featureString; + OwningPtr<const TargetMachine> + targetMachine(T.createTargetMachine(tripleString, featureString)); + InitRegisterMap(targetMachine->getRegisterInfo()); + } + }; + + class ThumbAsmLexer : public ARMBaseAsmLexer { + public: + ThumbAsmLexer(const Target &T, const MCAsmInfo &MAI) + : ARMBaseAsmLexer(T, MAI) { + std::string tripleString("thumb-unknown-unknown"); + std::string featureString; + OwningPtr<const TargetMachine> + targetMachine(T.createTargetMachine(tripleString, featureString)); + InitRegisterMap(targetMachine->getRegisterInfo()); + } + }; +} + +AsmToken ARMBaseAsmLexer::LexTokenUAL() { + const AsmToken &lexedToken = lexDefinite(); + + switch (lexedToken.getKind()) { + default: + return AsmToken(lexedToken); + case AsmToken::Error: + SetError(Lexer->getErrLoc(), Lexer->getErr()); + return AsmToken(lexedToken); + case AsmToken::Identifier: + { + std::string upperCase = lexedToken.getString().str(); + std::string lowerCase = LowercaseString(upperCase); + StringRef lowerRef(lowerCase); + + unsigned regID = MatchRegisterName(lowerRef); + + if (regID) { + return AsmToken(AsmToken::Register, + lexedToken.getString(), + static_cast<int64_t>(regID)); + } else { + return AsmToken(lexedToken); + } + } + } +} + +extern "C" void LLVMInitializeARMAsmLexer() { + RegisterAsmLexer<ARMAsmLexer> X(TheARMTarget); + RegisterAsmLexer<ThumbAsmLexer> Y(TheThumbTarget); +} + diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp new file mode 100644 index 0000000..bfa89c4 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -0,0 
+1,822 @@ +//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Target/TargetRegistry.h" +#include "llvm/Target/TargetAsmParser.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +using namespace llvm; + +namespace { +struct ARMOperand; + +// The shift types for register controlled shifts in arm memory addressing +enum ShiftType { + Lsl, + Lsr, + Asr, + Ror, + Rrx +}; + +class ARMAsmParser : public TargetAsmParser { + MCAsmParser &Parser; + +private: + MCAsmParser &getParser() const { return Parser; } + + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } + + bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + + bool MaybeParseRegister(OwningPtr<ARMOperand> &Op, bool ParseWriteBack); + + bool ParseRegisterList(OwningPtr<ARMOperand> &Op); + + bool ParseMemory(OwningPtr<ARMOperand> &Op); + + bool ParseMemoryOffsetReg(bool &Negative, + bool &OffsetRegShifted, + enum ShiftType &ShiftType, + const MCExpr *&ShiftAmount, + const MCExpr *&Offset, + bool &OffsetIsReg, + int &OffsetRegNum, + SMLoc &E); + + bool ParseShift(enum ShiftType &St, const MCExpr *&ShiftAmount, SMLoc &E); + + bool ParseOperand(OwningPtr<ARMOperand> &Op); + + bool ParseDirectiveWord(unsigned Size, SMLoc L); + + bool ParseDirectiveThumb(SMLoc L); + + bool ParseDirectiveThumbFunc(SMLoc L); + + bool ParseDirectiveCode(SMLoc L); + + bool ParseDirectiveSyntax(SMLoc L); + + // TODO - For now hacked versions of the next two are in here in this file to + // allow some parser testing until the table gen versions are implemented. + + /// @name Auto-generated Match Functions + /// { + bool MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCInst &Inst); + + /// MatchRegisterName - Match the given string to a register name and return + /// its register number, or -1 if there is no match. To allow return values + /// to be used directly in register lists, arm registers have values between + /// 0 and 15. + int MatchRegisterName(const StringRef &Name); + + /// } + + +public: + ARMAsmParser(const Target &T, MCAsmParser &_Parser) + : TargetAsmParser(T), Parser(_Parser) {} + + virtual bool ParseInstruction(const StringRef &Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + virtual bool ParseDirective(AsmToken DirectiveID); +}; + +/// ARMOperand - Instances of this class represent a parsed ARM machine +/// instruction. 
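+/// Each operand is one of four kinds. Parsing "ldr r0, [r1, #4]", for
+/// example, yields a Token operand for the mnemonic, a Register operand for
+/// "r0" and a Memory operand for "[r1, #4]"; a bare "#42" or a label used as
+/// an operand is represented as an Immediate.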
+struct ARMOperand : public MCParsedAsmOperand { +private: + ARMOperand() {} +public: + enum KindTy { + Token, + Register, + Immediate, + Memory + } Kind; + + SMLoc StartLoc, EndLoc; + + union { + struct { + const char *Data; + unsigned Length; + } Tok; + + struct { + unsigned RegNum; + bool Writeback; + } Reg; + + struct { + const MCExpr *Val; + } Imm; + + // This is for all forms of ARM address expressions + struct { + unsigned BaseRegNum; + unsigned OffsetRegNum; // used when OffsetIsReg is true + const MCExpr *Offset; // used when OffsetIsReg is false + const MCExpr *ShiftAmount; // used when OffsetRegShifted is true + enum ShiftType ShiftType; // used when OffsetRegShifted is true + unsigned + OffsetRegShifted : 1, // only used when OffsetIsReg is true + Preindexed : 1, + Postindexed : 1, + OffsetIsReg : 1, + Negative : 1, // only used when OffsetIsReg is true + Writeback : 1; + } Mem; + + }; + + ARMOperand(KindTy K, SMLoc S, SMLoc E) + : Kind(K), StartLoc(S), EndLoc(E) {} + + ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + switch (Kind) { + case Token: + Tok = o.Tok; + break; + case Register: + Reg = o.Reg; + break; + case Immediate: + Imm = o.Imm; + break; + case Memory: + Mem = o.Mem; + break; + } + } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const { return EndLoc; } + + StringRef getToken() const { + assert(Kind == Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + unsigned getReg() const { + assert(Kind == Register && "Invalid access!"); + return Reg.RegNum; + } + + const MCExpr *getImm() const { + assert(Kind == Immediate && "Invalid access!"); + return Imm.Val; + } + + bool isToken() const {return Kind == Token; } + + bool isReg() const { return Kind == Register; } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + static void CreateToken(OwningPtr<ARMOperand> &Op, StringRef Str, + SMLoc S) { + Op.reset(new ARMOperand); + Op->Kind = Token; + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + } + + static void CreateReg(OwningPtr<ARMOperand> &Op, unsigned RegNum, + bool Writeback, SMLoc S, SMLoc E) { + Op.reset(new ARMOperand); + Op->Kind = Register; + Op->Reg.RegNum = RegNum; + Op->Reg.Writeback = Writeback; + + Op->StartLoc = S; + Op->EndLoc = E; + } + + static void CreateImm(OwningPtr<ARMOperand> &Op, const MCExpr *Val, + SMLoc S, SMLoc E) { + Op.reset(new ARMOperand); + Op->Kind = Immediate; + Op->Imm.Val = Val; + + Op->StartLoc = S; + Op->EndLoc = E; + } + + static void CreateMem(OwningPtr<ARMOperand> &Op, + unsigned BaseRegNum, bool OffsetIsReg, + const MCExpr *Offset, unsigned OffsetRegNum, + bool OffsetRegShifted, enum ShiftType ShiftType, + const MCExpr *ShiftAmount, bool Preindexed, + bool Postindexed, bool Negative, bool Writeback, + SMLoc S, SMLoc E) { + Op.reset(new ARMOperand); + Op->Kind = Memory; + Op->Mem.BaseRegNum = BaseRegNum; + Op->Mem.OffsetIsReg = OffsetIsReg; + Op->Mem.Offset = Offset; + Op->Mem.OffsetRegNum = OffsetRegNum; + Op->Mem.OffsetRegShifted = OffsetRegShifted; + Op->Mem.ShiftType = ShiftType; + Op->Mem.ShiftAmount = ShiftAmount; + Op->Mem.Preindexed = Preindexed; + Op->Mem.Postindexed = Postindexed; + 
Op->Mem.Negative = Negative; + Op->Mem.Writeback = Writeback; + + Op->StartLoc = S; + Op->EndLoc = E; + } +}; + +} // end anonymous namespace. + +/// Try to parse a register name. The token must be an Identifier when called, +/// and if it is a register name a Reg operand is created, the token is eaten +/// and false is returned. Else true is returned and no token is eaten. +/// TODO this is likely to change to allow different register types and or to +/// parse for a specific register type. +bool ARMAsmParser::MaybeParseRegister + (OwningPtr<ARMOperand> &Op, bool ParseWriteBack) { + SMLoc S, E; + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + // FIXME: Validate register for the current architecture; we have to do + // validation later, so maybe there is no need for this here. + int RegNum; + + RegNum = MatchRegisterName(Tok.getString()); + if (RegNum == -1) + return true; + + S = Tok.getLoc(); + + Parser.Lex(); // Eat identifier token. + + E = Parser.getTok().getLoc(); + + bool Writeback = false; + if (ParseWriteBack) { + const AsmToken &ExclaimTok = Parser.getTok(); + if (ExclaimTok.is(AsmToken::Exclaim)) { + E = ExclaimTok.getLoc(); + Writeback = true; + Parser.Lex(); // Eat exclaim token + } + } + + ARMOperand::CreateReg(Op, RegNum, Writeback, S, E); + + return false; +} + +/// Parse a register list, return false if successful else return true or an +/// error. The first token must be a '{' when called. +bool ARMAsmParser::ParseRegisterList(OwningPtr<ARMOperand> &Op) { + SMLoc S, E; + assert(Parser.getTok().is(AsmToken::LCurly) && + "Token is not an Left Curly Brace"); + S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat left curly brace token. + + const AsmToken &RegTok = Parser.getTok(); + SMLoc RegLoc = RegTok.getLoc(); + if (RegTok.isNot(AsmToken::Identifier)) + return Error(RegLoc, "register expected"); + int RegNum = MatchRegisterName(RegTok.getString()); + if (RegNum == -1) + return Error(RegLoc, "register expected"); + Parser.Lex(); // Eat identifier token. + unsigned RegList = 1 << RegNum; + + int HighRegNum = RegNum; + // TODO ranges like "{Rn-Rm}" + while (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // Eat comma token. + + const AsmToken &RegTok = Parser.getTok(); + SMLoc RegLoc = RegTok.getLoc(); + if (RegTok.isNot(AsmToken::Identifier)) + return Error(RegLoc, "register expected"); + int RegNum = MatchRegisterName(RegTok.getString()); + if (RegNum == -1) + return Error(RegLoc, "register expected"); + + if (RegList & (1 << RegNum)) + Warning(RegLoc, "register duplicated in register list"); + else if (RegNum <= HighRegNum) + Warning(RegLoc, "register not in ascending order in register list"); + RegList |= 1 << RegNum; + HighRegNum = RegNum; + + Parser.Lex(); // Eat identifier token. + } + const AsmToken &RCurlyTok = Parser.getTok(); + if (RCurlyTok.isNot(AsmToken::RCurly)) + return Error(RCurlyTok.getLoc(), "'}' expected"); + E = RCurlyTok.getLoc(); + Parser.Lex(); // Eat left curly brace token. + + return false; +} + +/// Parse an arm memory expression, return false if successful else return true +/// or an error. The first token must be a '[' when called. +/// TODO Only preindexing and postindexing addressing are started, unindexed +/// with option, etc are still to do. 
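+/// Examples of the forms accepted below:
+///   [r0]                  plain base register
+///   [r0, #4]              pre-indexed immediate offset
+///   [r0, r1, lsl #2]!     pre-indexed shifted register offset with writeback
+///   [r0], r1              post-indexed register offset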
+bool ARMAsmParser::ParseMemory(OwningPtr<ARMOperand> &Op) { + SMLoc S, E; + assert(Parser.getTok().is(AsmToken::LBrac) && + "Token is not an Left Bracket"); + S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat left bracket token. + + const AsmToken &BaseRegTok = Parser.getTok(); + if (BaseRegTok.isNot(AsmToken::Identifier)) + return Error(BaseRegTok.getLoc(), "register expected"); + if (MaybeParseRegister(Op, false)) + return Error(BaseRegTok.getLoc(), "register expected"); + int BaseRegNum = Op->getReg(); + + bool Preindexed = false; + bool Postindexed = false; + bool OffsetIsReg = false; + bool Negative = false; + bool Writeback = false; + + // First look for preindexed address forms, that is after the "[Rn" we now + // have to see if the next token is a comma. + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Comma)) { + Preindexed = true; + Parser.Lex(); // Eat comma token. + int OffsetRegNum; + bool OffsetRegShifted; + enum ShiftType ShiftType; + const MCExpr *ShiftAmount; + const MCExpr *Offset; + if(ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, ShiftAmount, + Offset, OffsetIsReg, OffsetRegNum, E)) + return true; + const AsmToken &RBracTok = Parser.getTok(); + if (RBracTok.isNot(AsmToken::RBrac)) + return Error(RBracTok.getLoc(), "']' expected"); + E = RBracTok.getLoc(); + Parser.Lex(); // Eat right bracket token. + + const AsmToken &ExclaimTok = Parser.getTok(); + if (ExclaimTok.is(AsmToken::Exclaim)) { + E = ExclaimTok.getLoc(); + Writeback = true; + Parser.Lex(); // Eat exclaim token + } + ARMOperand::CreateMem(Op, BaseRegNum, OffsetIsReg, Offset, OffsetRegNum, + OffsetRegShifted, ShiftType, ShiftAmount, + Preindexed, Postindexed, Negative, Writeback, S, E); + return false; + } + // The "[Rn" we have so far was not followed by a comma. + else if (Tok.is(AsmToken::RBrac)) { + // This is a post indexing addressing forms, that is a ']' follows after + // the "[Rn". + Postindexed = true; + Writeback = true; + E = Tok.getLoc(); + Parser.Lex(); // Eat right bracket token. + + int OffsetRegNum = 0; + bool OffsetRegShifted = false; + enum ShiftType ShiftType; + const MCExpr *ShiftAmount; + const MCExpr *Offset; + + const AsmToken &NextTok = Parser.getTok(); + if (NextTok.isNot(AsmToken::EndOfStatement)) { + if (NextTok.isNot(AsmToken::Comma)) + return Error(NextTok.getLoc(), "',' expected"); + Parser.Lex(); // Eat comma token. + if(ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, + ShiftAmount, Offset, OffsetIsReg, OffsetRegNum, + E)) + return true; + } + + ARMOperand::CreateMem(Op, BaseRegNum, OffsetIsReg, Offset, OffsetRegNum, + OffsetRegShifted, ShiftType, ShiftAmount, + Preindexed, Postindexed, Negative, Writeback, S, E); + return false; + } + + return true; +} + +/// Parse the offset of a memory operand after we have seen "[Rn," or "[Rn]," +/// we will parse the following (were +/- means that a plus or minus is +/// optional): +/// +/-Rm +/// +/-Rm, shift +/// #offset +/// we return false on success or an error otherwise. +bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, + bool &OffsetRegShifted, + enum ShiftType &ShiftType, + const MCExpr *&ShiftAmount, + const MCExpr *&Offset, + bool &OffsetIsReg, + int &OffsetRegNum, + SMLoc &E) { + OwningPtr<ARMOperand> Op; + Negative = false; + OffsetRegShifted = false; + OffsetIsReg = false; + OffsetRegNum = -1; + const AsmToken &NextTok = Parser.getTok(); + E = NextTok.getLoc(); + if (NextTok.is(AsmToken::Plus)) + Parser.Lex(); // Eat plus token. 
+ else if (NextTok.is(AsmToken::Minus)) { + Negative = true; + Parser.Lex(); // Eat minus token + } + // See if there is a register following the "[Rn," or "[Rn]," we have so far. + const AsmToken &OffsetRegTok = Parser.getTok(); + if (OffsetRegTok.is(AsmToken::Identifier)) { + OffsetIsReg = !MaybeParseRegister(Op, false); + if (OffsetIsReg) { + E = Op->getEndLoc(); + OffsetRegNum = Op->getReg(); + } + } + // If we parsed a register as the offset then their can be a shift after that + if (OffsetRegNum != -1) { + // Look for a comma then a shift + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Comma)) { + Parser.Lex(); // Eat comma token. + + const AsmToken &Tok = Parser.getTok(); + if (ParseShift(ShiftType, ShiftAmount, E)) + return Error(Tok.getLoc(), "shift expected"); + OffsetRegShifted = true; + } + } + else { // the "[Rn," or "[Rn,]" we have so far was not followed by "Rm" + // Look for #offset following the "[Rn," or "[Rn]," + const AsmToken &HashTok = Parser.getTok(); + if (HashTok.isNot(AsmToken::Hash)) + return Error(HashTok.getLoc(), "'#' expected"); + + Parser.Lex(); // Eat hash token. + + if (getParser().ParseExpression(Offset)) + return true; + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + } + return false; +} + +/// ParseShift as one of these two: +/// ( lsl | lsr | asr | ror ) , # shift_amount +/// rrx +/// and returns true if it parses a shift otherwise it returns false. +bool ARMAsmParser::ParseShift(ShiftType &St, + const MCExpr *&ShiftAmount, + SMLoc &E) { + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return true; + const StringRef &ShiftName = Tok.getString(); + if (ShiftName == "lsl" || ShiftName == "LSL") + St = Lsl; + else if (ShiftName == "lsr" || ShiftName == "LSR") + St = Lsr; + else if (ShiftName == "asr" || ShiftName == "ASR") + St = Asr; + else if (ShiftName == "ror" || ShiftName == "ROR") + St = Ror; + else if (ShiftName == "rrx" || ShiftName == "RRX") + St = Rrx; + else + return true; + Parser.Lex(); // Eat shift type token. + + // Rrx stands alone. + if (St == Rrx) + return false; + + // Otherwise, there must be a '#' and a shift amount. + const AsmToken &HashTok = Parser.getTok(); + if (HashTok.isNot(AsmToken::Hash)) + return Error(HashTok.getLoc(), "'#' expected"); + Parser.Lex(); // Eat hash token. + + if (getParser().ParseExpression(ShiftAmount)) + return true; + + return false; +} + +/// A hack to allow some testing, to be replaced by a real table gen version. 
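+/// Accepts "r0".."r15" in upper or lower case plus the aliases "fp" (r11),
+/// "ip" (r12), "sp" (r13), "lr" (r14) and "pc" (r15), returning the register
+/// number, or -1 for anything else.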
+int ARMAsmParser::MatchRegisterName(const StringRef &Name) { + if (Name == "r0" || Name == "R0") + return 0; + else if (Name == "r1" || Name == "R1") + return 1; + else if (Name == "r2" || Name == "R2") + return 2; + else if (Name == "r3" || Name == "R3") + return 3; + else if (Name == "r3" || Name == "R3") + return 3; + else if (Name == "r4" || Name == "R4") + return 4; + else if (Name == "r5" || Name == "R5") + return 5; + else if (Name == "r6" || Name == "R6") + return 6; + else if (Name == "r7" || Name == "R7") + return 7; + else if (Name == "r8" || Name == "R8") + return 8; + else if (Name == "r9" || Name == "R9") + return 9; + else if (Name == "r10" || Name == "R10") + return 10; + else if (Name == "r11" || Name == "R11" || Name == "fp") + return 11; + else if (Name == "r12" || Name == "R12" || Name == "ip") + return 12; + else if (Name == "r13" || Name == "R13" || Name == "sp") + return 13; + else if (Name == "r14" || Name == "R14" || Name == "lr") + return 14; + else if (Name == "r15" || Name == "R15" || Name == "pc") + return 15; + return -1; +} + +/// A hack to allow some testing, to be replaced by a real table gen version. +bool ARMAsmParser:: +MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCInst &Inst) { + ARMOperand &Op0 = *(ARMOperand*)Operands[0]; + assert(Op0.Kind == ARMOperand::Token && "First operand not a Token"); + const StringRef &Mnemonic = Op0.getToken(); + if (Mnemonic == "add" || + Mnemonic == "stmfd" || + Mnemonic == "str" || + Mnemonic == "ldmfd" || + Mnemonic == "ldr" || + Mnemonic == "mov" || + Mnemonic == "sub" || + Mnemonic == "bl" || + Mnemonic == "push" || + Mnemonic == "blx" || + Mnemonic == "pop") { + // Hard-coded to a valid instruction, till we have a real matcher. + Inst = MCInst(); + Inst.setOpcode(ARM::MOVr); + Inst.addOperand(MCOperand::CreateReg(2)); + Inst.addOperand(MCOperand::CreateReg(2)); + Inst.addOperand(MCOperand::CreateImm(0)); + Inst.addOperand(MCOperand::CreateImm(0)); + Inst.addOperand(MCOperand::CreateReg(0)); + return false; + } + + return true; +} + +/// Parse a arm instruction operand. For now this parses the operand regardless +/// of the mnemonic. +bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) { + SMLoc S, E; + + switch (getLexer().getKind()) { + case AsmToken::Identifier: + if (!MaybeParseRegister(Op, true)) + return false; + // This was not a register so parse other operands that start with an + // identifier (like labels) as expressions and create them as immediates. + const MCExpr *IdVal; + S = Parser.getTok().getLoc(); + if (getParser().ParseExpression(IdVal)) + return true; + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + ARMOperand::CreateImm(Op, IdVal, S, E); + return false; + case AsmToken::LBrac: + return ParseMemory(Op); + case AsmToken::LCurly: + return ParseRegisterList(Op); + case AsmToken::Hash: + // #42 -> immediate. + // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate + S = Parser.getTok().getLoc(); + Parser.Lex(); + const MCExpr *ImmVal; + if (getParser().ParseExpression(ImmVal)) + return true; + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + ARMOperand::CreateImm(Op, ImmVal, S, E); + return false; + default: + return Error(Parser.getTok().getLoc(), "unexpected token in operand"); + } +} + +/// Parse an arm instruction mnemonic followed by its operands. 
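+/// For example, "add r0, r1, #4" becomes a Token operand for "add" followed
+/// by a Register operand for "r0", a Register operand for "r1" and an
+/// Immediate operand for "#4", each produced by ParseOperand above.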
+bool ARMAsmParser::ParseInstruction(const StringRef &Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OwningPtr<ARMOperand> Op; + ARMOperand::CreateToken(Op, Name, NameLoc); + + Operands.push_back(Op.take()); + + SMLoc Loc = Parser.getTok().getLoc(); + if (getLexer().isNot(AsmToken::EndOfStatement)) { + + // Read the first operand. + OwningPtr<ARMOperand> Op; + if (ParseOperand(Op)) return true; + Operands.push_back(Op.take()); + + while (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + + // Parse and remember the operand. + if (ParseOperand(Op)) return true; + Operands.push_back(Op.take()); + } + } + return false; +} + +/// ParseDirective parses the arm specific directives +bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + if (IDVal == ".word") + return ParseDirectiveWord(4, DirectiveID.getLoc()); + else if (IDVal == ".thumb") + return ParseDirectiveThumb(DirectiveID.getLoc()); + else if (IDVal == ".thumb_func") + return ParseDirectiveThumbFunc(DirectiveID.getLoc()); + else if (IDVal == ".code") + return ParseDirectiveCode(DirectiveID.getLoc()); + else if (IDVal == ".syntax") + return ParseDirectiveSyntax(DirectiveID.getLoc()); + return true; +} + +/// ParseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().ParseExpression(Value)) + return true; + + getParser().getStreamer().EmitValue(Value, Size, 0/*addrspace*/); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. + if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + } + } + + Parser.Lex(); + return false; +} + +/// ParseDirectiveThumb +/// ::= .thumb +bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + + // TODO: set thumb mode + // TODO: tell the MC streamer the mode + // getParser().getStreamer().Emit???(); + return false; +} + +/// ParseDirectiveThumbFunc +/// ::= .thumbfunc symbol_name +bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) { + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String)) + return Error(L, "unexpected token in .syntax directive"); + StringRef ATTRIBUTE_UNUSED SymbolName = Parser.getTok().getIdentifier(); + Parser.Lex(); // Consume the identifier token. 
+ + if (getLexer().isNot(AsmToken::EndOfStatement)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + + // TODO: mark symbol as a thumb symbol + // getParser().getStreamer().Emit???(); + return false; +} + +/// ParseDirectiveSyntax +/// ::= .syntax unified | divided +bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) { + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return Error(L, "unexpected token in .syntax directive"); + const StringRef &Mode = Tok.getString(); + bool unified_syntax; + if (Mode == "unified" || Mode == "UNIFIED") { + Parser.Lex(); + unified_syntax = true; + } + else if (Mode == "divided" || Mode == "DIVIDED") { + Parser.Lex(); + unified_syntax = false; + } + else + return Error(L, "unrecognized syntax mode in .syntax directive"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return Error(Parser.getTok().getLoc(), "unexpected token in directive"); + Parser.Lex(); + + // TODO tell the MC streamer the mode + // getParser().getStreamer().Emit???(); + return false; +} + +/// ParseDirectiveCode +/// ::= .code 16 | 32 +bool ARMAsmParser::ParseDirectiveCode(SMLoc L) { + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Integer)) + return Error(L, "unexpected token in .code directive"); + int64_t Val = Parser.getTok().getIntVal(); + bool thumb_mode; + if (Val == 16) { + Parser.Lex(); + thumb_mode = true; + } + else if (Val == 32) { + Parser.Lex(); + thumb_mode = false; + } + else + return Error(L, "invalid operand to .code directive"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return Error(Parser.getTok().getLoc(), "unexpected token in directive"); + Parser.Lex(); + + // TODO tell the MC streamer the mode + // getParser().getStreamer().Emit???(); + return false; +} + +extern "C" void LLVMInitializeARMAsmLexer(); + +/// Force static initialization. +extern "C" void LLVMInitializeARMAsmParser() { + RegisterAsmParser<ARMAsmParser> X(TheARMTarget); + RegisterAsmParser<ARMAsmParser> Y(TheThumbTarget); + LLVMInitializeARMAsmLexer(); +} diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt new file mode 100644 index 0000000..9ba7c01 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMARMAsmParser + ARMAsmLexer.cpp + ARMAsmParser.cpp + ) + diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/Makefile b/contrib/llvm/lib/Target/ARM/AsmParser/Makefile new file mode 100644 index 0000000..841516f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmParser/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMARMAsmParser + +# Hack: we need to include 'main' ARM target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
+ +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp new file mode 100644 index 0000000..d95efdb --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -0,0 +1,1455 @@ +//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to GAS-format ARM assembly language. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "ARM.h" +#include "ARMBuildAttrs.h" +#include "ARMAddressingModes.h" +#include "ARMConstantPoolValue.h" +#include "ARMInstPrinter.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMMCInstLower.h" +#include "ARMTargetMachine.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/Constants.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegistry.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include <cctype> +using namespace llvm; + +static cl::opt<bool> +EnableMCInst("enable-arm-mcinst-printer", cl::Hidden, + cl::desc("enable experimental asmprinter gunk in the arm backend")); + +namespace { + class ARMAsmPrinter : public AsmPrinter { + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when printing asm code for different targets. + const ARMSubtarget *Subtarget; + + /// AFI - Keep a pointer to ARMFunctionInfo for the current + /// MachineFunction. + ARMFunctionInfo *AFI; + + /// MCP - Keep a pointer to constantpool entries of the current + /// MachineFunction. 
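+ /// It is consulted by printCPInstOperand and the CONSTPOOL_ENTRY handling to fetch the entry being emitted.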
+ const MachineConstantPool *MCP; + + public: + explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL) { + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + } + + virtual const char *getPassName() const { + return "ARM Assembly Printer"; + } + + void printInstructionThroughMCStreamer(const MachineInstr *MI); + + + void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, + const char *Modifier = 0); + void printSOImmOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); + void printSOImm2PartOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printSORegOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printAddrMode2Operand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printAddrMode2OffsetOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printAddrMode3Operand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printAddrMode3OffsetOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printAddrMode4Operand(const MachineInstr *MI, int OpNum,raw_ostream &O, + const char *Modifier = 0); + void printAddrMode5Operand(const MachineInstr *MI, int OpNum,raw_ostream &O, + const char *Modifier = 0); + void printAddrMode6Operand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printAddrMode6OffsetOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printAddrModePCOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O, + const char *Modifier = 0); + void printBitfieldInvMaskImmOperand (const MachineInstr *MI, int OpNum, + raw_ostream &O); + + void printThumbS4ImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printThumbITMask(const MachineInstr *MI, int OpNum, raw_ostream &O); + void printThumbAddrModeRROperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printThumbAddrModeRI5Operand(const MachineInstr *MI, int OpNum, + raw_ostream &O, + unsigned Scale); + void printThumbAddrModeS1Operand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printThumbAddrModeS2Operand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printThumbAddrModeS4Operand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printThumbAddrModeSPOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + + void printT2SOOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); + void printT2AddrModeImm12Operand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printT2AddrModeImm8Operand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printT2AddrModeImm8s4Operand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printT2AddrModeImm8OffsetOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printT2AddrModeImm8s4OffsetOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) {} + void printT2AddrModeSoRegOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + + void printCPSOptionOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) {} + void printMSRMaskOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) {} + void printNegZeroOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) {} + void printPredicateOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printMandatoryPredicateOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printSBitModifierOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printPCLabel(const MachineInstr *MI, int 
OpNum, + raw_ostream &O); + void printRegisterList(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printCPInstOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O, + const char *Modifier); + void printJTBlockOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printJT2BlockOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printTBAddrMode(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printNoHashImmediate(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printVFPf32ImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printVFPf64ImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + + void printHex8ImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xff); + } + void printHex16ImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffff); + } + void printHex32ImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffffffff); + } + void printHex64ImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm()); + } + + virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O); + virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O); + + void printInstruction(const MachineInstr *MI, raw_ostream &O); // autogen + static const char *getRegisterName(unsigned RegNo); + + virtual void EmitInstruction(const MachineInstr *MI); + bool runOnMachineFunction(MachineFunction &F); + + virtual void EmitConstantPool() {} // we emit constant pools customly! + virtual void EmitFunctionEntryLabel(); + void EmitStartOfAsmFile(Module &M); + void EmitEndOfAsmFile(Module &M); + + MCSymbol *GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, + const MachineBasicBlock *MBB) const; + MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const; + + /// EmitMachineConstantPoolValue - Print a machine constantpool value to + /// the .s file. + virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { + SmallString<128> Str; + raw_svector_ostream OS(Str); + EmitMachineConstantPoolValue(MCPV, OS); + OutStreamer.EmitRawText(OS.str()); + } + + void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV, + raw_ostream &O) { + switch (TM.getTargetData()->getTypeAllocSize(MCPV->getType())) { + case 1: O << MAI->getData8bitsDirective(0); break; + case 2: O << MAI->getData16bitsDirective(0); break; + case 4: O << MAI->getData32bitsDirective(0); break; + default: assert(0 && "Unknown CPV size"); + } + + ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV); + + if (ACPV->isLSDA()) { + O << MAI->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber(); + } else if (ACPV->isBlockAddress()) { + O << *GetBlockAddressSymbol(ACPV->getBlockAddress()); + } else if (ACPV->isGlobalValue()) { + const GlobalValue *GV = ACPV->getGV(); + bool isIndirect = Subtarget->isTargetDarwin() && + Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()); + if (!isIndirect) + O << *Mang->getSymbol(GV); + else { + // FIXME: Remove this when Darwin transition to @GOT like syntax. 
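+ // The "$non_lazy_ptr" stub is only registered here; the pointer itself is emitted from the MachineModuleInfoMachO stub lists in EmitEndOfAsmFile.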
+ MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + O << *Sym; + + MachineModuleInfoMachO &MMIMachO = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + MachineModuleInfoImpl::StubValueTy &StubSym = + GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(Sym) : + MMIMachO.getGVStubEntry(Sym); + if (StubSym.getPointer() == 0) + StubSym = MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + } + } else { + assert(ACPV->isExtSymbol() && "unrecognized constant pool value"); + O << *GetExternalSymbolSymbol(ACPV->getSymbol()); + } + + if (ACPV->hasModifier()) O << "(" << ACPV->getModifier() << ")"; + if (ACPV->getPCAdjustment() != 0) { + O << "-(" << MAI->getPrivateGlobalPrefix() << "PC" + << getFunctionNumber() << "_" << ACPV->getLabelId() + << "+" << (unsigned)ACPV->getPCAdjustment(); + if (ACPV->mustAddCurrentAddress()) + O << "-."; + O << ')'; + } + } + }; +} // end of anonymous namespace + +#include "ARMGenAsmWriter.inc" + +void ARMAsmPrinter::EmitFunctionEntryLabel() { + if (AFI->isThumbFunction()) { + OutStreamer.EmitRawText(StringRef("\t.code\t16")); + if (!Subtarget->isTargetDarwin()) + OutStreamer.EmitRawText(StringRef("\t.thumb_func")); + else { + // This needs to emit to a temporary string to get properly quoted + // MCSymbols when they have spaces in them. + SmallString<128> Tmp; + raw_svector_ostream OS(Tmp); + OS << "\t.thumb_func\t" << *CurrentFnSym; + OutStreamer.EmitRawText(OS.str()); + } + } + + OutStreamer.EmitLabel(CurrentFnSym); +} + +/// runOnMachineFunction - This uses the printInstruction() +/// method to print assembly for each instruction. +/// +bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + AFI = MF.getInfo<ARMFunctionInfo>(); + MCP = MF.getConstantPool(); + + return AsmPrinter::runOnMachineFunction(MF); +} + +void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O, const char *Modifier) { + const MachineOperand &MO = MI->getOperand(OpNum); + unsigned TF = MO.getTargetFlags(); + + switch (MO.getType()) { + default: + assert(0 && "<unknown operand type>"); + case MachineOperand::MO_Register: { + unsigned Reg = MO.getReg(); + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + if (Modifier && strcmp(Modifier, "dregpair") == 0) { + unsigned DRegLo = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_0); + unsigned DRegHi = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_1); + O << '{' + << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi) + << '}'; + } else if (Modifier && strcmp(Modifier, "lane") == 0) { + unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg); + unsigned DReg = + TM.getRegisterInfo()->getMatchingSuperReg(Reg, + RegNum & 1 ? 
ARM::ssub_1 : ARM::ssub_0, &ARM::DPR_VFP2RegClass); + O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']'; + } else { + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + O << getRegisterName(Reg); + } + break; + } + case MachineOperand::MO_Immediate: { + int64_t Imm = MO.getImm(); + O << '#'; + if ((Modifier && strcmp(Modifier, "lo16") == 0) || + (TF & ARMII::MO_LO16)) + O << ":lower16:"; + else if ((Modifier && strcmp(Modifier, "hi16") == 0) || + (TF & ARMII::MO_HI16)) + O << ":upper16:"; + O << Imm; + break; + } + case MachineOperand::MO_MachineBasicBlock: + O << *MO.getMBB()->getSymbol(); + return; + case MachineOperand::MO_GlobalAddress: { + bool isCallOp = Modifier && !strcmp(Modifier, "call"); + const GlobalValue *GV = MO.getGlobal(); + + if ((Modifier && strcmp(Modifier, "lo16") == 0) || + (TF & ARMII::MO_LO16)) + O << ":lower16:"; + else if ((Modifier && strcmp(Modifier, "hi16") == 0) || + (TF & ARMII::MO_HI16)) + O << ":upper16:"; + O << *Mang->getSymbol(GV); + + printOffset(MO.getOffset(), O); + + if (isCallOp && Subtarget->isTargetELF() && + TM.getRelocationModel() == Reloc::PIC_) + O << "(PLT)"; + break; + } + case MachineOperand::MO_ExternalSymbol: { + bool isCallOp = Modifier && !strcmp(Modifier, "call"); + O << *GetExternalSymbolSymbol(MO.getSymbolName()); + + if (isCallOp && Subtarget->isTargetELF() && + TM.getRelocationModel() == Reloc::PIC_) + O << "(PLT)"; + break; + } + case MachineOperand::MO_ConstantPoolIndex: + O << *GetCPISymbol(MO.getIndex()); + break; + case MachineOperand::MO_JumpTableIndex: + O << *GetJTISymbol(MO.getIndex()); + break; + } +} + +static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm, + const MCAsmInfo *MAI) { + // Break it up into two parts that make up a shifter immediate. + V = ARM_AM::getSOImmVal(V); + assert(V != -1 && "Not a valid so_imm value!"); + + unsigned Imm = ARM_AM::getSOImmValImm(V); + unsigned Rot = ARM_AM::getSOImmValRot(V); + + // Print low-level immediate formation info, per + // A5.1.3: "Data-processing operands - Immediate". + if (Rot) { + O << "#" << Imm << ", " << Rot; + // Pretty printed version. + if (VerboseAsm) { + O << "\t" << MAI->getCommentString() << ' '; + O << (int)ARM_AM::rotr32(Imm, Rot); + } + } else { + O << "#" << Imm; + } +} + +/// printSOImmOperand - SOImm is 4-bit rotate amount in bits 8-11 with 8-bit +/// immediate in bits 0-7. +void ARMAsmPrinter::printSOImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isImm() && "Not a valid so_imm value!"); + printSOImm(O, MO.getImm(), isVerbose(), MAI); +} + +/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov' +/// followed by an 'orr' to materialize. +void ARMAsmPrinter::printSOImm2PartOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isImm() && "Not a valid so_imm value!"); + unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO.getImm()); + unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO.getImm()); + printSOImm(O, V1, isVerbose(), MAI); + O << "\n\torr"; + printPredicateOperand(MI, 2, O); + O << "\t"; + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 0, O); + O << ", "; + printSOImm(O, V2, isVerbose(), MAI); +} + +// so_reg is a 4-operand unit corresponding to register forms of the A5.1 +// "Addressing Mode 1 - Data-processing operands" forms. This includes: +// REG 0 0 - e.g. R5 +// REG REG 0,SH_OPC - e.g. R5, ROR R3 +// REG 0 IMM,SH_OPC - e.g. 
R5, LSL #3 +void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + const MachineOperand &MO3 = MI->getOperand(Op+2); + + O << getRegisterName(MO1.getReg()); + + // Print the shift opc. + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm())) + << " "; + + if (MO2.getReg()) { + O << getRegisterName(MO2.getReg()); + assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); + } else { + O << "#" << ARM_AM::getSORegOffset(MO3.getImm()); + } +} + +void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + const MachineOperand &MO3 = MI->getOperand(Op+2); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + + O << "[" << getRegisterName(MO1.getReg()); + + if (!MO2.getReg()) { + if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0. + O << ", #" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << ARM_AM::getAM2Offset(MO3.getImm()); + O << "]"; + return; + } + + O << ", " + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << getRegisterName(MO2.getReg()); + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm())) + << " #" << ShImm; + O << "]"; +} + +void ARMAsmPrinter::printAddrMode2OffsetOperand(const MachineInstr *MI, int Op, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + + if (!MO1.getReg()) { + unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm()); + O << "#" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) + << ImmOffs; + return; + } + + O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) + << getRegisterName(MO1.getReg()); + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm())) + << " #" << ShImm; +} + +void ARMAsmPrinter::printAddrMode3Operand(const MachineInstr *MI, int Op, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + const MachineOperand &MO3 = MI->getOperand(Op+2); + + assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); + O << "[" << getRegisterName(MO1.getReg()); + + if (MO2.getReg()) { + O << ", " + << (char)ARM_AM::getAM3Op(MO3.getImm()) + << getRegisterName(MO2.getReg()) + << "]"; + return; + } + + if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm())) + O << ", #" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) + << ImmOffs; + O << "]"; +} + +void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op, + raw_ostream &O){ + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + + if (MO1.getReg()) { + O << (char)ARM_AM::getAM3Op(MO2.getImm()) + << getRegisterName(MO1.getReg()); + return; + } + + unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm()); + O << "#" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) + << ImmOffs; +} + +void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op, + raw_ostream &O, + const char *Modifier) { + const MachineOperand &MO2 = MI->getOperand(Op+1); + ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); + if (Modifier && 
strcmp(Modifier, "submode") == 0) { + O << ARM_AM::getAMSubModeStr(Mode); + } else if (Modifier && strcmp(Modifier, "wide") == 0) { + ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); + if (Mode == ARM_AM::ia) + O << ".w"; + } else { + printOperand(MI, Op, O); + } +} + +void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op, + raw_ostream &O, + const char *Modifier) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + + assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); + + if (Modifier && strcmp(Modifier, "submode") == 0) { + ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm()); + O << ARM_AM::getAMSubModeStr(Mode); + return; + } else if (Modifier && strcmp(Modifier, "base") == 0) { + // Used for FSTM{D|S} and LSTM{D|S} operations. + O << getRegisterName(MO1.getReg()); + return; + } + + O << "[" << getRegisterName(MO1.getReg()); + + if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) { + O << ", #" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm())) + << ImmOffs*4; + } + O << "]"; +} + +void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + + O << "[" << getRegisterName(MO1.getReg()); + if (MO2.getImm()) { + // FIXME: Both darwin as and GNU as violate ARM docs here. + O << ", :" << MO2.getImm(); + } + O << "]"; +} + +void ARMAsmPrinter::printAddrMode6OffsetOperand(const MachineInstr *MI, int Op, + raw_ostream &O){ + const MachineOperand &MO = MI->getOperand(Op); + if (MO.getReg() == 0) + O << "!"; + else + O << ", " << getRegisterName(MO.getReg()); +} + +void ARMAsmPrinter::printAddrModePCOperand(const MachineInstr *MI, int Op, + raw_ostream &O, + const char *Modifier) { + if (Modifier && strcmp(Modifier, "label") == 0) { + printPCLabel(MI, Op+1, O); + return; + } + + const MachineOperand &MO1 = MI->getOperand(Op); + assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); + O << "[pc, " << getRegisterName(MO1.getReg()) << "]"; +} + +void +ARMAsmPrinter::printBitfieldInvMaskImmOperand(const MachineInstr *MI, int Op, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(Op); + uint32_t v = ~MO.getImm(); + int32_t lsb = CountTrailingZeros_32(v); + int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb; + assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!"); + O << "#" << lsb << ", #" << width; +} + +//===--------------------------------------------------------------------===// + +void ARMAsmPrinter::printThumbS4ImmOperand(const MachineInstr *MI, int Op, + raw_ostream &O) { + O << "#" << MI->getOperand(Op).getImm() * 4; +} + +void +ARMAsmPrinter::printThumbITMask(const MachineInstr *MI, int Op, + raw_ostream &O) { + // (3 - the number of trailing zeros) is the number of then / else. 
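+ // Bit 4 of the mask operand carries bit 0 of the IT condition; each mask bit above the terminating '1' (scanning from bit 3 down) prints 't' when it equals that bit and 'e' otherwise.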
+ unsigned Mask = MI->getOperand(Op).getImm(); + unsigned CondBit0 = Mask >> 4 & 1; + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { + bool T = ((Mask >> Pos) & 1) == CondBit0; + if (T) + O << 't'; + else + O << 'e'; + } +} + +void +ARMAsmPrinter::printThumbAddrModeRROperand(const MachineInstr *MI, int Op, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + O << "[" << getRegisterName(MO1.getReg()); + O << ", " << getRegisterName(MO2.getReg()) << "]"; +} + +void +ARMAsmPrinter::printThumbAddrModeRI5Operand(const MachineInstr *MI, int Op, + raw_ostream &O, + unsigned Scale) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + const MachineOperand &MO3 = MI->getOperand(Op+2); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + + O << "[" << getRegisterName(MO1.getReg()); + if (MO3.getReg()) + O << ", " << getRegisterName(MO3.getReg()); + else if (unsigned ImmOffs = MO2.getImm()) + O << ", #" << ImmOffs * Scale; + O << "]"; +} + +void +ARMAsmPrinter::printThumbAddrModeS1Operand(const MachineInstr *MI, int Op, + raw_ostream &O) { + printThumbAddrModeRI5Operand(MI, Op, O, 1); +} +void +ARMAsmPrinter::printThumbAddrModeS2Operand(const MachineInstr *MI, int Op, + raw_ostream &O) { + printThumbAddrModeRI5Operand(MI, Op, O, 2); +} +void +ARMAsmPrinter::printThumbAddrModeS4Operand(const MachineInstr *MI, int Op, + raw_ostream &O) { + printThumbAddrModeRI5Operand(MI, Op, O, 4); +} + +void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + O << "[" << getRegisterName(MO1.getReg()); + if (unsigned ImmOffs = MO2.getImm()) + O << ", #" << ImmOffs*4; + O << "]"; +} + +//===--------------------------------------------------------------------===// + +// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2 +// register with shift forms. +// REG 0 0 - e.g. R5 +// REG IMM, SH_OPC - e.g. R5, LSL #3 +void ARMAsmPrinter::printT2SOOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); + + unsigned Reg = MO1.getReg(); + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + O << getRegisterName(Reg); + + // Print the shift opc. + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm())) + << " "; + + assert(MO2.isImm() && "Not a valid t2_so_reg value!"); + O << "#" << ARM_AM::getSORegOffset(MO2.getImm()); +} + +void ARMAsmPrinter::printT2AddrModeImm12Operand(const MachineInstr *MI, + int OpNum, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + + unsigned OffImm = MO2.getImm(); + if (OffImm) // Don't print +0. + O << ", #" << OffImm; + O << "]"; +} + +void ARMAsmPrinter::printT2AddrModeImm8Operand(const MachineInstr *MI, + int OpNum, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm(); + // Don't print +0. 
+ if (OffImm < 0) + O << ", #-" << -OffImm; + else if (OffImm > 0) + O << ", #" << OffImm; + O << "]"; +} + +void ARMAsmPrinter::printT2AddrModeImm8s4Operand(const MachineInstr *MI, + int OpNum, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm() / 4; + // Don't print +0. + if (OffImm < 0) + O << ", #-" << -OffImm * 4; + else if (OffImm > 0) + O << ", #" << OffImm * 4; + O << "]"; +} + +void ARMAsmPrinter::printT2AddrModeImm8OffsetOperand(const MachineInstr *MI, + int OpNum, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNum); + int32_t OffImm = (int32_t)MO1.getImm(); + // Don't print +0. + if (OffImm < 0) + O << "#-" << -OffImm; + else if (OffImm > 0) + O << "#" << OffImm; +} + +void ARMAsmPrinter::printT2AddrModeSoRegOperand(const MachineInstr *MI, + int OpNum, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); + const MachineOperand &MO3 = MI->getOperand(OpNum+2); + + O << "[" << getRegisterName(MO1.getReg()); + + assert(MO2.getReg() && "Invalid so_reg load / store address!"); + O << ", " << getRegisterName(MO2.getReg()); + + unsigned ShAmt = MO3.getImm(); + if (ShAmt) { + assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!"); + O << ", lsl #" << ShAmt; + } + O << "]"; +} + + +//===--------------------------------------------------------------------===// + +void ARMAsmPrinter::printPredicateOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + if (CC != ARMCC::AL) + O << ARMCondCodeToString(CC); +} + +void ARMAsmPrinter::printMandatoryPredicateOperand(const MachineInstr *MI, + int OpNum, + raw_ostream &O) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + O << ARMCondCodeToString(CC); +} + +void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O){ + unsigned Reg = MI->getOperand(OpNum).getReg(); + if (Reg) { + assert(Reg == ARM::CPSR && "Expect ARM CPSR register!"); + O << 's'; + } +} + +void ARMAsmPrinter::printPCLabel(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + int Id = (int)MI->getOperand(OpNum).getImm(); + O << MAI->getPrivateGlobalPrefix() + << "PC" << getFunctionNumber() << "_" << Id; +} + +void ARMAsmPrinter::printRegisterList(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + O << "{"; + for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).isImplicit()) + continue; + if ((int)i != OpNum) O << ", "; + printOperand(MI, i, O); + } + O << "}"; +} + +void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O, const char *Modifier) { + assert(Modifier && "This operand only works with a modifier!"); + // There are two aspects to a CONSTANTPOOL_ENTRY operand, the label and the + // data itself. 
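+ // The "label" modifier emits the entry's CPI symbol; "cpentry" emits the data, using EmitMachineConstantPoolValue for target-specific entries and EmitGlobalConstant otherwise.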
+ if (!strcmp(Modifier, "label")) { + unsigned ID = MI->getOperand(OpNum).getImm(); + OutStreamer.EmitLabel(GetCPISymbol(ID)); + } else { + assert(!strcmp(Modifier, "cpentry") && "Unknown modifier for CPE"); + unsigned CPI = MI->getOperand(OpNum).getIndex(); + + const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI]; + + if (MCPE.isMachineConstantPoolEntry()) { + EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); + } else { + EmitGlobalConstant(MCPE.Val.ConstVal); + } + } +} + +MCSymbol *ARMAsmPrinter:: +GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, + const MachineBasicBlock *MBB) const { + SmallString<60> Name; + raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() + << getFunctionNumber() << '_' << uid << '_' << uid2 + << "_set_" << MBB->getNumber(); + return OutContext.GetOrCreateSymbol(Name.str()); +} + +MCSymbol *ARMAsmPrinter:: +GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const { + SmallString<60> Name; + raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "JTI" + << getFunctionNumber() << '_' << uid << '_' << uid2; + return OutContext.GetOrCreateSymbol(Name.str()); +} + +void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + assert(!Subtarget->isThumb2() && "Thumb2 should use double-jump jumptables!"); + + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id + + unsigned JTI = MO1.getIndex(); + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); + // Can't use EmitLabel until instprinter happens, label comes out in the wrong + // order. + O << *JTISymbol << ":\n"; + + const char *JTEntryDirective = MAI->getData32bitsDirective(); + + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + bool UseSet= MAI->hasSetDirective() && TM.getRelocationModel() == Reloc::PIC_; + SmallPtrSet<MachineBasicBlock*, 8> JTSets; + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + bool isNew = JTSets.insert(MBB); + + if (UseSet && isNew) { + O << "\t.set\t" + << *GetARMSetPICJumpTableLabel2(JTI, MO2.getImm(), MBB) << ',' + << *MBB->getSymbol() << '-' << *JTISymbol << '\n'; + } + + O << JTEntryDirective << ' '; + if (UseSet) + O << *GetARMSetPICJumpTableLabel2(JTI, MO2.getImm(), MBB); + else if (TM.getRelocationModel() == Reloc::PIC_) + O << *MBB->getSymbol() << '-' << *JTISymbol; + else + O << *MBB->getSymbol(); + + if (i != e-1) + O << '\n'; + } +} + +void ARMAsmPrinter::printJT2BlockOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id + unsigned JTI = MO1.getIndex(); + + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); + + // Can't use EmitLabel until instprinter happens, label comes out in the wrong + // order. 
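+ // For t2TBB/t2TBH each entry is a (MBB - table)/2 byte or halfword offset; other Thumb2 jump tables fall back to full "b.w" branches.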
+ O << *JTISymbol << ":\n"; + + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + bool ByteOffset = false, HalfWordOffset = false; + if (MI->getOpcode() == ARM::t2TBB) + ByteOffset = true; + else if (MI->getOpcode() == ARM::t2TBH) + HalfWordOffset = true; + + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + if (ByteOffset) + O << MAI->getData8bitsDirective(); + else if (HalfWordOffset) + O << MAI->getData16bitsDirective(); + + if (ByteOffset || HalfWordOffset) + O << '(' << *MBB->getSymbol() << "-" << *JTISymbol << ")/2"; + else + O << "\tb.w " << *MBB->getSymbol(); + + if (i != e-1) + O << '\n'; + } +} + +void ARMAsmPrinter::printTBAddrMode(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + O << "[pc, " << getRegisterName(MI->getOperand(OpNum).getReg()); + if (MI->getOpcode() == ARM::t2TBH) + O << ", lsl #1"; + O << ']'; +} + +void ARMAsmPrinter::printNoHashImmediate(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + O << MI->getOperand(OpNum).getImm(); +} + +void ARMAsmPrinter::printVFPf32ImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + const ConstantFP *FP = MI->getOperand(OpNum).getFPImm(); + O << '#' << FP->getValueAPF().convertToFloat(); + if (isVerbose()) { + O << "\t\t" << MAI->getCommentString() << ' '; + WriteAsOperand(O, FP, /*PrintType=*/false); + } +} + +void ARMAsmPrinter::printVFPf64ImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + const ConstantFP *FP = MI->getOperand(OpNum).getFPImm(); + O << '#' << FP->getValueAPF().convertToDouble(); + if (isVerbose()) { + O << "\t\t" << MAI->getCommentString() << ' '; + WriteAsOperand(O, FP, /*PrintType=*/false); + } +} + +bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'a': // Print as a memory address. + if (MI->getOperand(OpNum).isReg()) { + O << "[" << getRegisterName(MI->getOperand(OpNum).getReg()) << "]"; + return false; + } + // Fallthrough + case 'c': // Don't print "#" before an immediate operand. + if (!MI->getOperand(OpNum).isImm()) + return true; + printNoHashImmediate(MI, OpNum, O); + return false; + case 'P': // Print a VFP double precision register. + case 'q': // Print a NEON quad precision register. + printOperand(MI, OpNum, O); + return false; + case 'Q': + if (TM.getTargetData()->isLittleEndian()) + break; + // Fallthrough + case 'R': + if (TM.getTargetData()->isBigEndian()) + break; + // Fallthrough + case 'H': // Write second word of DI / DF reference. + // Verify that this operand has two consecutive registers. + if (!MI->getOperand(OpNum).isReg() || + OpNum+1 == MI->getNumOperands() || + !MI->getOperand(OpNum+1).isReg()) + return true; + ++OpNum; // Return the high-part. + } + } + + printOperand(MI, OpNum, O); + return false; +} + +bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNum, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. 
+ + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isReg() && "unexpected inline asm memory operand"); + O << "[" << getRegisterName(MO.getReg()) << "]"; + return false; +} + +void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { + if (EnableMCInst) { + printInstructionThroughMCStreamer(MI); + return; + } + + if (MI->getOpcode() == ARM::CONSTPOOL_ENTRY) + EmitAlignment(2); + + SmallString<128> Str; + raw_svector_ostream OS(Str); + if (MI->getOpcode() == ARM::DBG_VALUE) { + unsigned NOps = MI->getNumOperands(); + assert(NOps==4); + OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); + OS << V.getName(); + OS << " <- "; + // Frame address. Currently handles register +- offset only. + assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); + OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS); + OS << ']'; + OS << "+"; + printOperand(MI, NOps-2, OS); + OutStreamer.EmitRawText(OS.str()); + return; + } + + printInstruction(MI, OS); + OutStreamer.EmitRawText(OS.str()); + + // Make sure the instruction that follows TBB is 2-byte aligned. + // FIXME: Constant island pass should insert an "ALIGN" instruction instead. + if (MI->getOpcode() == ARM::t2TBB) + EmitAlignment(1); +} + +void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { + if (Subtarget->isTargetDarwin()) { + Reloc::Model RelocM = TM.getRelocationModel(); + if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) { + // Declare all the text sections up front (before the DWARF sections + // emitted by AsmPrinter::doInitialization) so the assembler will keep + // them together at the beginning of the object file. This helps + // avoid out-of-range branches that are due a fundamental limitation of + // the way symbol offsets are encoded with the current Darwin ARM + // relocations. + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>( + getObjFileLowering()); + OutStreamer.SwitchSection(TLOFMacho.getTextSection()); + OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection()); + OutStreamer.SwitchSection(TLOFMacho.getConstTextCoalSection()); + if (RelocM == Reloc::DynamicNoPIC) { + const MCSection *sect = + OutContext.getMachOSection("__TEXT", "__symbol_stub4", + MCSectionMachO::S_SYMBOL_STUBS, + 12, SectionKind::getText()); + OutStreamer.SwitchSection(sect); + } else { + const MCSection *sect = + OutContext.getMachOSection("__TEXT", "__picsymbolstub4", + MCSectionMachO::S_SYMBOL_STUBS, + 16, SectionKind::getText()); + OutStreamer.SwitchSection(sect); + } + } + } + + // Use unified assembler syntax. + OutStreamer.EmitRawText(StringRef("\t.syntax unified")); + + // Emit ARM Build Attributes + if (Subtarget->isTargetELF()) { + // CPU Type + std::string CPUString = Subtarget->getCPUString(); + if (CPUString != "generic") + OutStreamer.EmitRawText("\t.cpu " + Twine(CPUString)); + + // FIXME: Emit FPU type + if (Subtarget->hasVFP2()) + OutStreamer.EmitRawText("\t.eabi_attribute " + + Twine(ARMBuildAttrs::VFP_arch) + ", 2"); + + // Signal various FP modes. 
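+ // Denormal and exception handling are only claimed when unsafe FP math is disabled, and the FP number model attribute is set to 1 (finite values only) or 3 (full IEEE 754).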
+ if (!UnsafeFPMath) { + OutStreamer.EmitRawText("\t.eabi_attribute " + + Twine(ARMBuildAttrs::ABI_FP_denormal) + ", 1"); + OutStreamer.EmitRawText("\t.eabi_attribute " + + Twine(ARMBuildAttrs::ABI_FP_exceptions) + ", 1"); + } + + if (FiniteOnlyFPMath()) + OutStreamer.EmitRawText("\t.eabi_attribute " + + Twine(ARMBuildAttrs::ABI_FP_number_model)+ ", 1"); + else + OutStreamer.EmitRawText("\t.eabi_attribute " + + Twine(ARMBuildAttrs::ABI_FP_number_model)+ ", 3"); + + // 8-bytes alignment stuff. + OutStreamer.EmitRawText("\t.eabi_attribute " + + Twine(ARMBuildAttrs::ABI_align8_needed) + ", 1"); + OutStreamer.EmitRawText("\t.eabi_attribute " + + Twine(ARMBuildAttrs::ABI_align8_preserved) + ", 1"); + + // Hard float. Use both S and D registers and conform to AAPCS-VFP. + if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard) { + OutStreamer.EmitRawText("\t.eabi_attribute " + + Twine(ARMBuildAttrs::ABI_HardFP_use) + ", 3"); + OutStreamer.EmitRawText("\t.eabi_attribute " + + Twine(ARMBuildAttrs::ABI_VFP_args) + ", 1"); + } + // FIXME: Should we signal R9 usage? + } +} + + +void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { + if (Subtarget->isTargetDarwin()) { + // All darwin targets use mach-o. + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + MachineModuleInfoMachO &MMIMacho = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + + // Output non-lazy-pointers for external and common global variables. + MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList(); + + if (!Stubs.empty()) { + // Switch with ".non_lazy_symbol_pointer" directive. + OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); + EmitAlignment(2); + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer.EmitLabel(Stubs[i].first); + // .indirect_symbol _foo + MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(),MCSA_IndirectSymbol); + + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info pointers + // need to be indirect and pc-rel. We accomplish this by using NLPs. + // However, sometimes the types are local to the file. So we need to + // fill in the value for the NLP in those cases. + OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), + OutContext), + 4/*size*/, 0/*addrspace*/); + } + + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + Stubs = MMIMacho.GetHiddenGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); + EmitAlignment(2); + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer.EmitLabel(Stubs[i].first); + // .long _foo + OutStreamer.EmitValue(MCSymbolRefExpr:: + Create(Stubs[i].second.getPointer(), + OutContext), + 4/*size*/, 0/*addrspace*/); + } + + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. 
+ OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + } +} + +//===----------------------------------------------------------------------===// + +void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { + ARMMCInstLower MCInstLowering(OutContext, *Mang, *this); + switch (MI->getOpcode()) { + case ARM::t2MOVi32imm: + assert(0 && "Should be lowered by thumb2it pass"); + default: break; + case ARM::PICADD: { // FIXME: Remove asm string from td file. + // This is a pseudo op for a label + instruction sequence, which looks like: + // LPC0: + // add r0, pc, r0 + // This adds the address of LPC0 to r0. + + // Emit the label. + // FIXME: MOVE TO SHARED PLACE. + unsigned Id = (unsigned)MI->getOperand(2).getImm(); + const char *Prefix = MAI->getPrivateGlobalPrefix(); + MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix) + + "PC" + Twine(getFunctionNumber()) + "_" + Twine(Id)); + OutStreamer.EmitLabel(Label); + + + // Form and emit tha dd. + MCInst AddInst; + AddInst.setOpcode(ARM::ADDrr); + AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + AddInst.addOperand(MCOperand::CreateReg(ARM::PC)); + AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + OutStreamer.EmitInstruction(AddInst); + return; + } + case ARM::CONSTPOOL_ENTRY: { // FIXME: Remove asm string from td file. + /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool + /// in the function. The first operand is the ID# for this instruction, the + /// second is the index into the MachineConstantPool that this is, the third + /// is the size in bytes of this constant pool entry. + unsigned LabelId = (unsigned)MI->getOperand(0).getImm(); + unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex(); + + EmitAlignment(2); + OutStreamer.EmitLabel(GetCPISymbol(LabelId)); + + const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx]; + if (MCPE.isMachineConstantPoolEntry()) + EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); + else + EmitGlobalConstant(MCPE.Val.ConstVal); + + return; + } + case ARM::MOVi2pieces: { // FIXME: Remove asmstring from td file. + // This is a hack that lowers as a two instruction sequence. + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned ImmVal = (unsigned)MI->getOperand(1).getImm(); + + unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); + unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); + + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVi); + TmpInst.addOperand(MCOperand::CreateReg(DstReg)); + TmpInst.addOperand(MCOperand::CreateImm(SOImmValV1)); + + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); + + TmpInst.addOperand(MCOperand::CreateReg(0)); // cc_out + OutStreamer.EmitInstruction(TmpInst); + } + + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::ORRri); + TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // dstreg + TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // inreg + TmpInst.addOperand(MCOperand::CreateImm(SOImmValV2)); // so_imm + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); + + TmpInst.addOperand(MCOperand::CreateReg(0)); // cc_out + OutStreamer.EmitInstruction(TmpInst); + } + return; + } + case ARM::MOVi32imm: { // FIXME: Remove asmstring from td file. + // This is a hack that lowers as a two instruction sequence. 
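+ // For an immediate such as 0x12345678 this becomes a movw/movt pair: MOVi16 with #0x5678 then MOVTi16 with #0x1234; global addresses use :lower16:/:upper16: symbol references instead.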
+ unsigned DstReg = MI->getOperand(0).getReg(); + const MachineOperand &MO = MI->getOperand(1); + MCOperand V1, V2; + if (MO.isImm()) { + unsigned ImmVal = (unsigned)MI->getOperand(1).getImm(); + V1 = MCOperand::CreateImm(ImmVal & 65535); + V2 = MCOperand::CreateImm(ImmVal >> 16); + } else if (MO.isGlobal()) { + MCSymbol *Symbol = MCInstLowering.GetGlobalAddressSymbol(MO); + const MCSymbolRefExpr *SymRef1 = + MCSymbolRefExpr::Create(Symbol, + MCSymbolRefExpr::VK_ARM_LO16, OutContext); + const MCSymbolRefExpr *SymRef2 = + MCSymbolRefExpr::Create(Symbol, + MCSymbolRefExpr::VK_ARM_HI16, OutContext); + V1 = MCOperand::CreateExpr(SymRef1); + V2 = MCOperand::CreateExpr(SymRef2); + } else { + MI->dump(); + llvm_unreachable("cannot handle this operand"); + } + + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVi16); + TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // dstreg + TmpInst.addOperand(V1); // lower16(imm) + + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); + + OutStreamer.EmitInstruction(TmpInst); + } + + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVTi16); + TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // dstreg + TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // srcreg + TmpInst.addOperand(V2); // upper16(imm) + + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); + + OutStreamer.EmitInstruction(TmpInst); + } + + return; + } + } + + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + OutStreamer.EmitInstruction(TmpInst); +} + +//===----------------------------------------------------------------------===// +// Target Registry Stuff +//===----------------------------------------------------------------------===// + +static MCInstPrinter *createARMMCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI) { + if (SyntaxVariant == 0) + return new ARMInstPrinter(MAI, false); + return 0; +} + +// Force static initialization. +extern "C" void LLVMInitializeARMAsmPrinter() { + RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget); + RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget); + + TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter); +} + diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp new file mode 100644 index 0000000..2b94b76 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp @@ -0,0 +1,800 @@ +//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an ARM MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "ARM.h" // FIXME: FACTOR ENUMS BETTER. 
+#include "ARMInstPrinter.h" +#include "ARMAddressingModes.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +// Include the auto-generated portion of the assembly writer. +#define MachineInstr MCInst +#define ARMAsmPrinter ARMInstPrinter // FIXME: REMOVE. +#include "ARMGenAsmWriter.inc" +#undef MachineInstr +#undef ARMAsmPrinter + +static unsigned NextReg(unsigned Reg) { + switch (Reg) { + default: + assert(0 && "Unexpected register enum"); + + case ARM::D0: + return ARM::D1; + case ARM::D1: + return ARM::D2; + case ARM::D2: + return ARM::D3; + case ARM::D3: + return ARM::D4; + case ARM::D4: + return ARM::D5; + case ARM::D5: + return ARM::D6; + case ARM::D6: + return ARM::D7; + case ARM::D7: + return ARM::D8; + case ARM::D8: + return ARM::D9; + case ARM::D9: + return ARM::D10; + case ARM::D10: + return ARM::D11; + case ARM::D11: + return ARM::D12; + case ARM::D12: + return ARM::D13; + case ARM::D13: + return ARM::D14; + case ARM::D14: + return ARM::D15; + case ARM::D15: + return ARM::D16; + case ARM::D16: + return ARM::D17; + case ARM::D17: + return ARM::D18; + case ARM::D18: + return ARM::D19; + case ARM::D19: + return ARM::D20; + case ARM::D20: + return ARM::D21; + case ARM::D21: + return ARM::D22; + case ARM::D22: + return ARM::D23; + case ARM::D23: + return ARM::D24; + case ARM::D24: + return ARM::D25; + case ARM::D25: + return ARM::D26; + case ARM::D26: + return ARM::D27; + case ARM::D27: + return ARM::D28; + case ARM::D28: + return ARM::D29; + case ARM::D29: + return ARM::D30; + case ARM::D30: + return ARM::D31; + } +} + +void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) { + // Check for MOVs and print canonical forms, instead. 
+ if (MI->getOpcode() == ARM::MOVs) { + const MCOperand &Dst = MI->getOperand(0); + const MCOperand &MO1 = MI->getOperand(1); + const MCOperand &MO2 = MI->getOperand(2); + const MCOperand &MO3 = MI->getOperand(3); + + O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm())); + printSBitModifierOperand(MI, 6, O); + printPredicateOperand(MI, 4, O); + + O << '\t' << getRegisterName(Dst.getReg()) + << ", " << getRegisterName(MO1.getReg()); + + if (ARM_AM::getSORegShOp(MO3.getImm()) == ARM_AM::rrx) + return; + + O << ", "; + + if (MO2.getReg()) { + O << getRegisterName(MO2.getReg()); + assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); + } else { + O << "#" << ARM_AM::getSORegOffset(MO3.getImm()); + } + return; + } + + // A8.6.123 PUSH + if ((MI->getOpcode() == ARM::STM_UPD || MI->getOpcode() == ARM::t2STM_UPD) && + MI->getOperand(0).getReg() == ARM::SP) { + const MCOperand &MO1 = MI->getOperand(2); + if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::db) { + O << '\t' << "push"; + printPredicateOperand(MI, 3, O); + O << '\t'; + printRegisterList(MI, 5, O); + return; + } + } + + // A8.6.122 POP + if ((MI->getOpcode() == ARM::LDM_UPD || MI->getOpcode() == ARM::t2LDM_UPD) && + MI->getOperand(0).getReg() == ARM::SP) { + const MCOperand &MO1 = MI->getOperand(2); + if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::ia) { + O << '\t' << "pop"; + printPredicateOperand(MI, 3, O); + O << '\t'; + printRegisterList(MI, 5, O); + return; + } + } + + // A8.6.355 VPUSH + if ((MI->getOpcode() == ARM::VSTMS_UPD || MI->getOpcode() ==ARM::VSTMD_UPD) && + MI->getOperand(0).getReg() == ARM::SP) { + const MCOperand &MO1 = MI->getOperand(2); + if (ARM_AM::getAM5SubMode(MO1.getImm()) == ARM_AM::db) { + O << '\t' << "vpush"; + printPredicateOperand(MI, 3, O); + O << '\t'; + printRegisterList(MI, 5, O); + return; + } + } + + // A8.6.354 VPOP + if ((MI->getOpcode() == ARM::VLDMS_UPD || MI->getOpcode() ==ARM::VLDMD_UPD) && + MI->getOperand(0).getReg() == ARM::SP) { + const MCOperand &MO1 = MI->getOperand(2); + if (ARM_AM::getAM5SubMode(MO1.getImm()) == ARM_AM::ia) { + O << '\t' << "vpop"; + printPredicateOperand(MI, 3, O); + O << '\t'; + printRegisterList(MI, 5, O); + return; + } + } + + printInstruction(MI, O); + } + +void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Modifier && strcmp(Modifier, "dregpair") == 0) { + O << '{' << getRegisterName(Reg) << ", " + << getRegisterName(NextReg(Reg)) << '}'; +#if 0 + // FIXME: Breaks e.g. ARM/vmul.ll. + assert(0); + /* + unsigned DRegLo = TRI->getSubReg(Reg, ARM::dsub_0); + unsigned DRegHi = TRI->getSubReg(Reg, ARM::dsub_1); + O << '{' + << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi) + << '}';*/ +#endif + } else if (Modifier && strcmp(Modifier, "lane") == 0) { + assert(0); + /* + unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg); + unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 
2 : 1, + &ARM::DPR_VFP2RegClass); + O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']'; + */ + } else { + O << getRegisterName(Reg); + } + } else if (Op.isImm()) { + assert((Modifier && !strcmp(Modifier, "call")) || + ((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported")); + O << '#' << Op.getImm(); + } else { + if (Modifier && Modifier[0] != 0 && strcmp(Modifier, "call") != 0) + llvm_unreachable("Unsupported modifier"); + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << *Op.getExpr(); + } +} + +static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm, + const MCAsmInfo *MAI) { + // Break it up into two parts that make up a shifter immediate. + V = ARM_AM::getSOImmVal(V); + assert(V != -1 && "Not a valid so_imm value!"); + + unsigned Imm = ARM_AM::getSOImmValImm(V); + unsigned Rot = ARM_AM::getSOImmValRot(V); + + // Print low-level immediate formation info, per + // A5.1.3: "Data-processing operands - Immediate". + if (Rot) { + O << "#" << Imm << ", " << Rot; + // Pretty printed version. + if (VerboseAsm) + O << ' ' << MAI->getCommentString() + << ' ' << (int)ARM_AM::rotr32(Imm, Rot); + } else { + O << "#" << Imm; + } +} + + +/// printSOImmOperand - SOImm is 4-bit rotate amount in bits 8-11 with 8-bit +/// immediate in bits 0-7. +void ARMInstPrinter::printSOImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + assert(MO.isImm() && "Not a valid so_imm value!"); + printSOImm(O, MO.getImm(), VerboseAsm, &MAI); +} + +/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov' +/// followed by an 'orr' to materialize. +void ARMInstPrinter::printSOImm2PartOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + // FIXME: REMOVE this method. + abort(); +} + +// so_reg is a 4-operand unit corresponding to register forms of the A5.1 +// "Addressing Mode 1 - Data-processing operands" forms. This includes: +// REG 0 0 - e.g. R5 +// REG REG 0,SH_OPC - e.g. R5, ROR R3 +// REG 0 IMM,SH_OPC - e.g. R5, LSL #3 +void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + const MCOperand &MO3 = MI->getOperand(OpNum+2); + + O << getRegisterName(MO1.getReg()); + + // Print the shift opc. + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm())) + << ' '; + + if (MO2.getReg()) { + O << getRegisterName(MO2.getReg()); + assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); + } else { + O << "#" << ARM_AM::getSORegOffset(MO3.getImm()); + } +} + + +void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + + O << "[" << getRegisterName(MO1.getReg()); + + if (!MO2.getReg()) { + if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0. 
+ O << ", #" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << ARM_AM::getAM2Offset(MO3.getImm()); + O << "]"; + return; + } + + O << ", " + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << getRegisterName(MO2.getReg()); + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm())) + << " #" << ShImm; + O << "]"; +} + +void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + if (!MO1.getReg()) { + unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm()); + O << '#' + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) + << ImmOffs; + return; + } + + O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) + << getRegisterName(MO1.getReg()); + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm())) + << " #" << ShImm; +} + +void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + const MCOperand &MO3 = MI->getOperand(OpNum+2); + + O << '[' << getRegisterName(MO1.getReg()); + + if (MO2.getReg()) { + O << ", " << (char)ARM_AM::getAM3Op(MO3.getImm()) + << getRegisterName(MO2.getReg()) << ']'; + return; + } + + if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm())) + O << ", #" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) + << ImmOffs; + O << ']'; +} + +void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + if (MO1.getReg()) { + O << (char)ARM_AM::getAM3Op(MO2.getImm()) + << getRegisterName(MO1.getReg()); + return; + } + + unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm()); + O << '#' + << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) + << ImmOffs; +} + + +void ARMInstPrinter::printAddrMode4Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, + const char *Modifier) { + const MCOperand &MO2 = MI->getOperand(OpNum+1); + ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); + if (Modifier && strcmp(Modifier, "submode") == 0) { + O << ARM_AM::getAMSubModeStr(Mode); + } else if (Modifier && strcmp(Modifier, "wide") == 0) { + ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); + if (Mode == ARM_AM::ia) + O << ".w"; + } else { + printOperand(MI, OpNum, O); + } +} + +void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, + const char *Modifier) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, OpNum, O); + return; + } + + if (Modifier && strcmp(Modifier, "submode") == 0) { + ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm()); + O << ARM_AM::getAMSubModeStr(Mode); + return; + } else if (Modifier && strcmp(Modifier, "base") == 0) { + // Used for FSTM{D|S} and LSTM{D|S} operations. 
+ O << getRegisterName(MO1.getReg()); + return; + } + + O << "[" << getRegisterName(MO1.getReg()); + + if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) { + O << ", #" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm())) + << ImmOffs*4; + } + O << "]"; +} + +void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + if (MO2.getImm()) { + // FIXME: Both darwin as and GNU as violate ARM docs here. + O << ", :" << MO2.getImm(); + } + O << "]"; +} + +void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + if (MO.getReg() == 0) + O << "!"; + else + O << ", " << getRegisterName(MO.getReg()); +} + +void ARMInstPrinter::printAddrModePCOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, + const char *Modifier) { + assert(0 && "FIXME: Implement printAddrModePCOperand"); +} + +void ARMInstPrinter::printBitfieldInvMaskImmOperand (const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + uint32_t v = ~MO.getImm(); + int32_t lsb = CountTrailingZeros_32(v); + int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb; + assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!"); + O << '#' << lsb << ", #" << width; +} + +void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "{"; + for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) { + if (i != OpNum) O << ", "; + O << getRegisterName(MI->getOperand(i).getReg()); + } + O << "}"; +} + +void ARMInstPrinter::printCPSOptionOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + unsigned option = Op.getImm(); + unsigned mode = option & 31; + bool changemode = option >> 5 & 1; + unsigned AIF = option >> 6 & 7; + unsigned imod = option >> 9 & 3; + if (imod == 2) + O << "ie"; + else if (imod == 3) + O << "id"; + O << '\t'; + if (imod > 1) { + if (AIF & 4) O << 'a'; + if (AIF & 2) O << 'i'; + if (AIF & 1) O << 'f'; + if (AIF > 0 && changemode) O << ", "; + } + if (changemode) + O << '#' << mode; +} + +void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + unsigned Mask = Op.getImm(); + if (Mask) { + O << '_'; + if (Mask & 8) O << 'f'; + if (Mask & 4) O << 's'; + if (Mask & 2) O << 'x'; + if (Mask & 1) O << 'c'; + } +} + +void ARMInstPrinter::printNegZeroOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + O << '#'; + if (Op.getImm() < 0) + O << '-' << (-Op.getImm() - 1); + else + O << Op.getImm(); +} + +void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + if (CC != ARMCC::AL) + O << ARMCondCodeToString(CC); +} + +void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + O << ARMCondCodeToString(CC); +} + +void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + if (MI->getOperand(OpNum).getReg()) { + assert(MI->getOperand(OpNum).getReg() == ARM::CPSR && + "Expect ARM CPSR 
register!"); + O << 's'; + } +} + + + +void ARMInstPrinter::printCPInstOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, + const char *Modifier) { + // FIXME: remove this. + abort(); +} + +void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << MI->getOperand(OpNum).getImm(); +} + + +void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + // FIXME: remove this. + abort(); +} + +void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "#" << MI->getOperand(OpNum).getImm() * 4; +} + +void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + // (3 - the number of trailing zeros) is the number of then / else. + unsigned Mask = MI->getOperand(OpNum).getImm(); + unsigned CondBit0 = Mask >> 4 & 1; + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { + bool T = ((Mask >> Pos) & 1) == CondBit0; + if (T) + O << 't'; + else + O << 'e'; + } +} + +void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + O << "[" << getRegisterName(MO1.getReg()); + O << ", " << getRegisterName(MO2.getReg()) << "]"; +} + +void ARMInstPrinter::printThumbAddrModeRI5Operand(const MCInst *MI, unsigned Op, + raw_ostream &O, + unsigned Scale) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + + O << "[" << getRegisterName(MO1.getReg()); + if (MO3.getReg()) + O << ", " << getRegisterName(MO3.getReg()); + else if (unsigned ImmOffs = MO2.getImm()) + O << ", #" << ImmOffs * Scale; + O << "]"; +} + +void ARMInstPrinter::printThumbAddrModeS1Operand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + printThumbAddrModeRI5Operand(MI, Op, O, 1); +} + +void ARMInstPrinter::printThumbAddrModeS2Operand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + printThumbAddrModeRI5Operand(MI, Op, O, 2); +} + +void ARMInstPrinter::printThumbAddrModeS4Operand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + printThumbAddrModeRI5Operand(MI, Op, O, 4); +} + +void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + O << "[" << getRegisterName(MO1.getReg()); + if (unsigned ImmOffs = MO2.getImm()) + O << ", #" << ImmOffs*4; + O << "]"; +} + +void ARMInstPrinter::printTBAddrMode(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "[pc, " << getRegisterName(MI->getOperand(OpNum).getReg()); + if (MI->getOpcode() == ARM::t2TBH) + O << ", lsl #1"; + O << ']'; +} + +// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2 +// register with shift forms. +// REG 0 0 - e.g. R5 +// REG IMM, SH_OPC - e.g. R5, LSL #3 +void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + unsigned Reg = MO1.getReg(); + O << getRegisterName(Reg); + + // Print the shift opc. 
+ O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm())) + << " "; + + assert(MO2.isImm() && "Not a valid t2_so_reg value!"); + O << "#" << ARM_AM::getSORegOffset(MO2.getImm()); +} + +void ARMInstPrinter::printT2AddrModeImm12Operand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + + unsigned OffImm = MO2.getImm(); + if (OffImm) // Don't print +0. + O << ", #" << OffImm; + O << "]"; +} + +void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm(); + // Don't print +0. + if (OffImm < 0) + O << ", #-" << -OffImm; + else if (OffImm > 0) + O << ", #" << OffImm; + O << "]"; +} + +void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm() / 4; + // Don't print +0. + if (OffImm < 0) + O << ", #-" << -OffImm * 4; + else if (OffImm > 0) + O << ", #" << OffImm * 4; + O << "]"; +} + +void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + int32_t OffImm = (int32_t)MO1.getImm(); + // Don't print +0. + if (OffImm < 0) + O << "#-" << -OffImm; + else if (OffImm > 0) + O << "#" << OffImm; +} + +void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + int32_t OffImm = (int32_t)MO1.getImm() / 4; + // Don't print +0. 
+ if (OffImm < 0) + O << "#-" << -OffImm * 4; + else if (OffImm > 0) + O << "#" << OffImm * 4; +} + +void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + const MCOperand &MO3 = MI->getOperand(OpNum+2); + + O << "[" << getRegisterName(MO1.getReg()); + + assert(MO2.getReg() && "Invalid so_reg load / store address!"); + O << ", " << getRegisterName(MO2.getReg()); + + unsigned ShAmt = MO3.getImm(); + if (ShAmt) { + assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!"); + O << ", lsl #" << ShAmt; + } + O << "]"; +} + +void ARMInstPrinter::printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << '#' << MI->getOperand(OpNum).getImm(); +} + +void ARMInstPrinter::printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << '#' << MI->getOperand(OpNum).getImm(); +} + +void ARMInstPrinter::printHex8ImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xff); +} + +void ARMInstPrinter::printHex16ImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffff); +} + +void ARMInstPrinter::printHex32ImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffffffff); +} + +void ARMInstPrinter::printHex64ImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm()); +} diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h new file mode 100644 index 0000000..be0b7c1 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h @@ -0,0 +1,119 @@ +//===-- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an ARM MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMINSTPRINTER_H +#define ARMINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + class MCOperand; + +class ARMInstPrinter : public MCInstPrinter { + bool VerboseAsm; +public: + ARMInstPrinter(const MCAsmInfo &MAI, bool verboseAsm) + : MCInstPrinter(MAI), VerboseAsm(verboseAsm) {} + + virtual void printInst(const MCInst *MI, raw_ostream &O); + + // Autogenerated by tblgen. 
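+  // (printInstruction and getRegisterName are supplied by the included
+  // ARMGenAsmWriter.inc; see the #define wrapper in ARMInstPrinter.cpp.)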
+ void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, + const char *Modifier = 0); + + void printSOImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printSOImm2PartOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printSORegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printAddrMode4Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O, + const char *Modifier = 0); + void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O, + const char *Modifier = 0); + void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printAddrModePCOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O, + const char *Modifier = 0); + + void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printThumbAddrModeRI5Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, unsigned Scale); + void printThumbAddrModeS1Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printThumbAddrModeS2Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printThumbAddrModeS4Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printT2SOOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printT2AddrModeImm12Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printCPSOptionOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNegZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printSBitModifierOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printRegisterList(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printCPInstOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O, + const char *Modifier); + void printJTBlockOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {} + void printJT2BlockOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {} + void printTBAddrMode(const 
MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printHex8ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printHex16ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printHex32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printHex64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); + // FIXME: Implement. + void PrintSpecial(const MCInst *MI, raw_ostream &O, const char *Kind) {} +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp new file mode 100644 index 0000000..ab2b06b --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp @@ -0,0 +1,162 @@ +//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower ARM MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "ARMMCInstLower.h" +//#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +//#include "llvm/MC/MCStreamer.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallString.h" +using namespace llvm; + + +#if 0 +const ARMSubtarget &ARMMCInstLower::getSubtarget() const { + return AsmPrinter.getSubtarget(); +} + +MachineModuleInfoMachO &ARMMCInstLower::getMachOMMI() const { + assert(getSubtarget().isTargetDarwin() &&"Can only get MachO info on darwin"); + return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>(); +} +#endif + +MCSymbol *ARMMCInstLower:: +GetGlobalAddressSymbol(const MachineOperand &MO) const { + // FIXME: HANDLE PLT references how?? + switch (MO.getTargetFlags()) { + default: assert(0 && "Unknown target flag on GV operand"); + case 0: break; + } + + return Printer.Mang->getSymbol(MO.getGlobal()); +} + +MCSymbol *ARMMCInstLower:: +GetExternalSymbolSymbol(const MachineOperand &MO) const { + // FIXME: HANDLE PLT references how?? + switch (MO.getTargetFlags()) { + default: assert(0 && "Unknown target flag on GV operand"); + case 0: break; + } + + return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); +} + + + +MCSymbol *ARMMCInstLower:: +GetJumpTableSymbol(const MachineOperand &MO) const { + SmallString<256> Name; + raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI" + << Printer.getFunctionNumber() << '_' << MO.getIndex(); + +#if 0 + switch (MO.getTargetFlags()) { + default: llvm_unreachable("Unknown target flag on GV operand"); + } +#endif + + // Create a symbol for the name. 
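+  // The generated name has the form <PrivateGlobalPrefix>JTI<function#>_<jt#>,
+  // e.g. "LJTI1_0" when the private global prefix is "L".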
+ return Ctx.GetOrCreateSymbol(Name.str()); +} + +MCSymbol *ARMMCInstLower:: +GetConstantPoolIndexSymbol(const MachineOperand &MO) const { + SmallString<256> Name; + raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI" + << Printer.getFunctionNumber() << '_' << MO.getIndex(); + +#if 0 + switch (MO.getTargetFlags()) { + default: llvm_unreachable("Unknown target flag on GV operand"); + } +#endif + + // Create a symbol for the name. + return Ctx.GetOrCreateSymbol(Name.str()); +} + +MCOperand ARMMCInstLower:: +LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx); + +#if 0 + switch (MO.getTargetFlags()) { + default: llvm_unreachable("Unknown target flag on GV operand"); + } +#endif + + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(MO.getOffset(), Ctx), + Ctx); + return MCOperand::CreateExpr(Expr); +} + + +void ARMMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + MI->dump(); + assert(0 && "unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. + if (MO.isImplicit()) continue; + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + MCOp = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( + MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: + MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); + break; + case MachineOperand::MO_ExternalSymbol: + MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); + break; + case MachineOperand::MO_JumpTableIndex: + MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO)); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO)); + break; + case MachineOperand::MO_BlockAddress: + MCOp = LowerSymbolOperand(MO, Printer.GetBlockAddressSymbol( + MO.getBlockAddress())); + break; + } + + OutMI.addOperand(MCOp); + } + +} diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h new file mode 100644 index 0000000..b81a306 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h @@ -0,0 +1,56 @@ +//===-- ARMMCInstLower.h - Lower MachineInstr to MCInst -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef ARM_MCINSTLOWER_H +#define ARM_MCINSTLOWER_H + +#include "llvm/Support/Compiler.h" + +namespace llvm { + class AsmPrinter; + class MCAsmInfo; + class MCContext; + class MCInst; + class MCOperand; + class MCSymbol; + class MachineInstr; + class MachineModuleInfoMachO; + class MachineOperand; + class Mangler; + //class ARMSubtarget; + +/// ARMMCInstLower - This class is used to lower an MachineInstr into an MCInst. 
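+/// A rough usage sketch (the exact constructor arguments are whatever
+/// MCContext, Mangler and AsmPrinter the caller already holds): the asm
+/// printer builds one of these once and then, per MachineInstr, does
+///
+///   MCInst TmpInst;
+///   MCInstLowering.Lower(MI, TmpInst);
+///   OutStreamer.EmitInstruction(TmpInst);
+///
+/// as in the MCStreamer-based emission path of ARMAsmPrinter.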
+class LLVM_LIBRARY_VISIBILITY ARMMCInstLower { + MCContext &Ctx; + Mangler &Mang; + AsmPrinter &Printer; + + //const ARMSubtarget &getSubtarget() const; +public: + ARMMCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer) + : Ctx(ctx), Mang(mang), Printer(printer) {} + + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + + //MCSymbol *GetPICBaseSymbol() const; + MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; + MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; + MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const; + MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + +/* +private: + MachineModuleInfoMachO &getMachOMMI() const; + */ +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt new file mode 100644 index 0000000..4e299f8 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt @@ -0,0 +1,8 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMARMAsmPrinter + ARMAsmPrinter.cpp + ARMInstPrinter.cpp + ARMMCInstLower.cpp + ) +add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile b/contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile new file mode 100644 index 0000000..65d372e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMARMAsmPrinter + +# Hack: we need to include 'main' arm target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
+ +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/ARM/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/CMakeLists.txt new file mode 100644 index 0000000..29e66e1 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/CMakeLists.txt @@ -0,0 +1,43 @@ +set(LLVM_TARGET_DEFINITIONS ARM.td) + +tablegen(ARMGenRegisterInfo.h.inc -gen-register-desc-header) +tablegen(ARMGenRegisterNames.inc -gen-register-enums) +tablegen(ARMGenRegisterInfo.inc -gen-register-desc) +tablegen(ARMGenInstrNames.inc -gen-instr-enums) +tablegen(ARMGenInstrInfo.inc -gen-instr-desc) +tablegen(ARMGenCodeEmitter.inc -gen-emitter) +tablegen(ARMGenAsmWriter.inc -gen-asm-writer) +tablegen(ARMGenDAGISel.inc -gen-dag-isel) +tablegen(ARMGenCallingConv.inc -gen-callingconv) +tablegen(ARMGenSubtarget.inc -gen-subtarget) +tablegen(ARMGenEDInfo.inc -gen-enhanced-disassembly-info) + +add_llvm_target(ARMCodeGen + ARMBaseInstrInfo.cpp + ARMBaseRegisterInfo.cpp + ARMCodeEmitter.cpp + ARMConstantIslandPass.cpp + ARMConstantPoolValue.cpp + ARMExpandPseudoInsts.cpp + ARMISelDAGToDAG.cpp + ARMISelLowering.cpp + ARMInstrInfo.cpp + ARMJITInfo.cpp + ARMLoadStoreOptimizer.cpp + ARMMCAsmInfo.cpp + ARMRegisterInfo.cpp + ARMSubtarget.cpp + ARMTargetMachine.cpp + ARMTargetObjectFile.cpp + NEONMoveFix.cpp + NEONPreAllocPass.cpp + Thumb1InstrInfo.cpp + Thumb1RegisterInfo.cpp + Thumb2ITBlockPass.cpp + Thumb2InstrInfo.cpp + Thumb2RegisterInfo.cpp + Thumb2SizeReduction.cpp + ARMSelectionDAGInfo.cpp + ) + +target_link_libraries (LLVMARMCodeGen LLVMSelectionDAG) diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp new file mode 100644 index 0000000..4de697e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -0,0 +1,578 @@ +//===- ARMDisassembler.cpp - Disassembler for ARM/Thumb ISA -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the ARM Disassembler. +// It contains code to implement the public interfaces of ARMDisassembler and +// ThumbDisassembler, both of which are instances of MCDisassembler. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-disassembler" + +#include "ARMDisassembler.h" +#include "ARMDisassemblerCore.h" + +#include "llvm/MC/EDInstInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Target/TargetRegistry.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MemoryObject.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +/// ARMGenDecoderTables.inc - ARMDecoderTables.inc is tblgen'ed from +/// ARMDecoderEmitter.cpp TableGen backend. It contains: +/// +/// o Mappings from opcode to ARM/Thumb instruction format +/// +/// o static uint16_t decodeInstruction(uint32_t insn) - the decoding function +/// for an ARM instruction. +/// +/// o static uint16_t decodeThumbInstruction(field_t insn) - the decoding +/// function for a Thumb instruction. +/// +#include "../ARMGenDecoderTables.inc" + +#include "../ARMGenEDInfo.inc" + +using namespace llvm; + +/// showBitVector - Use the raw_ostream to log a diagnostic message describing +/// the inidividual bits of the instruction. 
+/// +static inline void showBitVector(raw_ostream &os, const uint32_t &insn) { + // Split the bit position markers into more than one lines to fit 80 columns. + os << " 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11" + << " 10 9 8 7 6 5 4 3 2 1 0 \n"; + os << "---------------------------------------------------------------" + << "----------------------------------\n"; + os << '|'; + for (unsigned i = 32; i != 0; --i) { + if (insn >> (i - 1) & 0x01) + os << " 1"; + else + os << " 0"; + os << (i%4 == 1 ? '|' : ':'); + } + os << '\n'; + // Split the bit position markers into more than one lines to fit 80 columns. + os << "---------------------------------------------------------------" + << "----------------------------------\n"; + os << '\n'; +} + +/// decodeARMInstruction is a decorator function which tries special cases of +/// instruction matching before calling the auto-generated decoder function. +static unsigned decodeARMInstruction(uint32_t &insn) { + if (slice(insn, 31, 28) == 15) + goto AutoGenedDecoder; + + // Special case processing, if any, goes here.... + + // LLVM combines the offset mode of A8.6.197 & A8.6.198 into STRB. + // The insufficient encoding information of the combined instruction confuses + // the decoder wrt BFC/BFI. Therefore, we try to recover here. + // For BFC, Inst{27-21} = 0b0111110 & Inst{6-0} = 0b0011111. + // For BFI, Inst{27-21} = 0b0111110 & Inst{6-4} = 0b001 & Inst{3-0} =! 0b1111. + if (slice(insn, 27, 21) == 0x3e && slice(insn, 6, 4) == 1) { + if (slice(insn, 3, 0) == 15) + return ARM::BFC; + else + return ARM::BFI; + } + + // Ditto for ADDSrs, which is a super-instruction for A8.6.7 & A8.6.8. + // As a result, the decoder fails to decode UMULL properly. + if (slice(insn, 27, 21) == 0x04 && slice(insn, 7, 4) == 9) { + return ARM::UMULL; + } + + // Ditto for STR_PRE, which is a super-instruction for A8.6.194 & A8.6.195. + // As a result, the decoder fails to decode SBFX properly. + if (slice(insn, 27, 21) == 0x3d && slice(insn, 6, 4) == 5) + return ARM::SBFX; + + // And STRB_PRE, which is a super-instruction for A8.6.197 & A8.6.198. + // As a result, the decoder fails to decode UBFX properly. + if (slice(insn, 27, 21) == 0x3f && slice(insn, 6, 4) == 5) + return ARM::UBFX; + + // Ditto for STRT, which is a super-instruction for A8.6.210 Encoding A1 & A2. + // As a result, the decoder fails to deocode SSAT properly. + if (slice(insn, 27, 21) == 0x35 && slice(insn, 5, 4) == 1) + return slice(insn, 6, 6) == 0 ? ARM::SSATlsl : ARM::SSATasr; + + // Ditto for RSCrs, which is a super-instruction for A8.6.146 & A8.6.147. + // As a result, the decoder fails to decode STRHT/LDRHT/LDRSHT/LDRSBT. + if (slice(insn, 27, 24) == 0) { + switch (slice(insn, 21, 20)) { + case 2: + switch (slice(insn, 7, 4)) { + case 11: + return ARM::STRHT; + default: + break; // fallthrough + } + break; + case 3: + switch (slice(insn, 7, 4)) { + case 11: + return ARM::LDRHT; + case 13: + return ARM::LDRSBT; + case 15: + return ARM::LDRSHT; + default: + break; // fallthrough + } + break; + default: + break; // fallthrough + } + } + + // Ditto for SBCrs, which is a super-instruction for A8.6.152 & A8.6.153. + // As a result, the decoder fails to decode STRH_Post/LDRD_POST/STRD_POST + // properly. 
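+  // In the blocks below, Inst{24} (P) and Inst{21} (W) select the addressing
+  // form: PW==2 (P=1,W=0) is plain offset addressing, PW==3 (P=1,W=1) is
+  // pre-indexed with writeback, and PW==0 (P=0,W=0) is post-indexed.  PW==1
+  // (P=0,W=1) corresponds to the unprivileged *T forms already handled above.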
+ if (slice(insn, 27, 25) == 0 && slice(insn, 20, 20) == 0) { + unsigned PW = slice(insn, 24, 24) << 1 | slice(insn, 21, 21); + switch (slice(insn, 7, 4)) { + case 11: + switch (PW) { + case 2: // Offset + return ARM::STRH; + case 3: // Pre-indexed + return ARM::STRH_PRE; + case 0: // Post-indexed + return ARM::STRH_POST; + default: + break; // fallthrough + } + break; + case 13: + switch (PW) { + case 2: // Offset + return ARM::LDRD; + case 3: // Pre-indexed + return ARM::LDRD_PRE; + case 0: // Post-indexed + return ARM::LDRD_POST; + default: + break; // fallthrough + } + break; + case 15: + switch (PW) { + case 2: // Offset + return ARM::STRD; + case 3: // Pre-indexed + return ARM::STRD_PRE; + case 0: // Post-indexed + return ARM::STRD_POST; + default: + break; // fallthrough + } + break; + default: + break; // fallthrough + } + } + + // Ditto for SBCSSrs, which is a super-instruction for A8.6.152 & A8.6.153. + // As a result, the decoder fails to decode LDRH_POST/LDRSB_POST/LDRSH_POST + // properly. + if (slice(insn, 27, 25) == 0 && slice(insn, 20, 20) == 1) { + unsigned PW = slice(insn, 24, 24) << 1 | slice(insn, 21, 21); + switch (slice(insn, 7, 4)) { + case 11: + switch (PW) { + case 2: // Offset + return ARM::LDRH; + case 3: // Pre-indexed + return ARM::LDRH_PRE; + case 0: // Post-indexed + return ARM::LDRH_POST; + default: + break; // fallthrough + } + break; + case 13: + switch (PW) { + case 2: // Offset + return ARM::LDRSB; + case 3: // Pre-indexed + return ARM::LDRSB_PRE; + case 0: // Post-indexed + return ARM::LDRSB_POST; + default: + break; // fallthrough + } + break; + case 15: + switch (PW) { + case 2: // Offset + return ARM::LDRSH; + case 3: // Pre-indexed + return ARM::LDRSH_PRE; + case 0: // Post-indexed + return ARM::LDRSH_POST; + default: + break; // fallthrough + } + break; + default: + break; // fallthrough + } + } + +AutoGenedDecoder: + // Calling the auto-generated decoder function. + return decodeInstruction(insn); +} + +// Helper function for special case handling of LDR (literal) and friends. +// See, for example, A6.3.7 Load word: Table A6-18 Load word. +// See A8.6.57 T3, T4 & A8.6.60 T2 and friends for why we morphed the opcode +// before returning it. +static unsigned T2Morph2LoadLiteral(unsigned Opcode) { + switch (Opcode) { + default: + return Opcode; // Return unmorphed opcode. + + case ARM::t2LDRDi8: + return ARM::t2LDRDpci; + + case ARM::t2LDR_POST: case ARM::t2LDR_PRE: + case ARM::t2LDRi12: case ARM::t2LDRi8: + case ARM::t2LDRs: case ARM::t2LDRT: + return ARM::t2LDRpci; + + case ARM::t2LDRB_POST: case ARM::t2LDRB_PRE: + case ARM::t2LDRBi12: case ARM::t2LDRBi8: + case ARM::t2LDRBs: case ARM::t2LDRBT: + return ARM::t2LDRBpci; + + case ARM::t2LDRH_POST: case ARM::t2LDRH_PRE: + case ARM::t2LDRHi12: case ARM::t2LDRHi8: + case ARM::t2LDRHs: case ARM::t2LDRHT: + return ARM::t2LDRHpci; + + case ARM::t2LDRSB_POST: case ARM::t2LDRSB_PRE: + case ARM::t2LDRSBi12: case ARM::t2LDRSBi8: + case ARM::t2LDRSBs: case ARM::t2LDRSBT: + return ARM::t2LDRSBpci; + + case ARM::t2LDRSH_POST: case ARM::t2LDRSH_PRE: + case ARM::t2LDRSHi12: case ARM::t2LDRSHi8: + case ARM::t2LDRSHs: case ARM::t2LDRSHT: + return ARM::t2LDRSHpci; + } +} + +/// decodeThumbSideEffect is a decorator function which can potentially twiddle +/// the instruction or morph the returned opcode under Thumb2. 
+/// +/// First it checks whether the insn is a NEON or VFP instr; if true, bit +/// twiddling could be performed on insn to turn it into an ARM NEON/VFP +/// equivalent instruction and decodeInstruction is called with the transformed +/// insn. +/// +/// Next, there is special handling for Load byte/halfword/word instruction by +/// checking whether Rn=0b1111 and call T2Morph2LoadLiteral() on the decoded +/// Thumb2 instruction. See comments below for further details. +/// +/// Finally, one last check is made to see whether the insn is a NEON/VFP and +/// decodeInstruction(insn) is invoked on the original insn. +/// +/// Otherwise, decodeThumbInstruction is called with the original insn. +static unsigned decodeThumbSideEffect(bool IsThumb2, uint32_t &insn) { + if (IsThumb2) { + uint16_t op1 = slice(insn, 28, 27); + uint16_t op2 = slice(insn, 26, 20); + + // A6.3 32-bit Thumb instruction encoding + // Table A6-9 32-bit Thumb instruction encoding + + // The coprocessor instructions of interest are transformed to their ARM + // equivalents. + + // --------- Transform Begin Marker --------- + if ((op1 == 1 || op1 == 3) && slice(op2, 6, 4) == 7) { + // A7.4 Advanced SIMD data-processing instructions + // U bit of Thumb corresponds to Inst{24} of ARM. + uint16_t U = slice(op1, 1, 1); + + // Inst{28-24} of ARM = {1,0,0,1,U}; + uint16_t bits28_24 = 9 << 1 | U; + DEBUG(showBitVector(errs(), insn)); + setSlice(insn, 28, 24, bits28_24); + return decodeInstruction(insn); + } + + if (op1 == 3 && slice(op2, 6, 4) == 1 && slice(op2, 0, 0) == 0) { + // A7.7 Advanced SIMD element or structure load/store instructions + // Inst{27-24} of Thumb = 0b1001 + // Inst{27-24} of ARM = 0b0100 + DEBUG(showBitVector(errs(), insn)); + setSlice(insn, 27, 24, 4); + return decodeInstruction(insn); + } + // --------- Transform End Marker --------- + + // See, for example, A6.3.7 Load word: Table A6-18 Load word. + // See A8.6.57 T3, T4 & A8.6.60 T2 and friends for why we morphed the opcode + // before returning it to our caller. + if (op1 == 3 && slice(op2, 6, 5) == 0 && slice(op2, 0, 0) == 1 + && slice(insn, 19, 16) == 15) + return T2Morph2LoadLiteral(decodeThumbInstruction(insn)); + + // One last check for NEON/VFP instructions. + if ((op1 == 1 || op1 == 3) && slice(op2, 6, 6) == 1) + return decodeInstruction(insn); + + // Fall through. + } + + return decodeThumbInstruction(insn); +} + +static inline bool Thumb2PreloadOpcodeNoPCI(unsigned Opcode) { + switch (Opcode) { + default: + return false; + case ARM::t2PLDi12: case ARM::t2PLDi8: + case ARM::t2PLDr: case ARM::t2PLDs: + case ARM::t2PLDWi12: case ARM::t2PLDWi8: + case ARM::t2PLDWr: case ARM::t2PLDWs: + case ARM::t2PLIi12: case ARM::t2PLIi8: + case ARM::t2PLIr: case ARM::t2PLIs: + return true; + } +} + +static inline unsigned T2Morph2Preload2PCI(unsigned Opcode) { + switch (Opcode) { + default: + return 0; + case ARM::t2PLDi12: case ARM::t2PLDi8: + case ARM::t2PLDr: case ARM::t2PLDs: + return ARM::t2PLDpci; + case ARM::t2PLDWi12: case ARM::t2PLDWi8: + case ARM::t2PLDWr: case ARM::t2PLDWs: + return ARM::t2PLDWpci; + case ARM::t2PLIi12: case ARM::t2PLIi8: + case ARM::t2PLIr: case ARM::t2PLIs: + return ARM::t2PLIpci; + } +} + +// +// Public interface for the disassembler +// + +bool ARMDisassembler::getInstruction(MCInst &MI, + uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &os) const { + // The machine instruction. + uint32_t insn; + uint8_t bytes[4]; + + // We want to read exactly 4 bytes of data. 
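+  // They are assembled below into a little-endian 32-bit word; e.g. the byte
+  // sequence 04 B0 2D E5 yields insn == 0xE52DB004 ("str r11, [sp, #-4]!").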
+ if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) + return false; + + // Encoded as a small-endian 32-bit word in the stream. + insn = (bytes[3] << 24) | + (bytes[2] << 16) | + (bytes[1] << 8) | + (bytes[0] << 0); + + unsigned Opcode = decodeARMInstruction(insn); + ARMFormat Format = ARMFormats[Opcode]; + Size = 4; + + DEBUG({ + errs() << "Opcode=" << Opcode << " Name=" << ARMUtils::OpcodeName(Opcode) + << " Format=" << stringForARMFormat(Format) << '(' << (int)Format + << ")\n"; + showBitVector(errs(), insn); + }); + + ARMBasicMCBuilder *Builder = CreateMCBuilder(Opcode, Format); + if (!Builder) + return false; + + if (!Builder->Build(MI, insn)) + return false; + + delete Builder; + + return true; +} + +bool ThumbDisassembler::getInstruction(MCInst &MI, + uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &os) const { + // The Thumb instruction stream is a sequence of halhwords. + + // This represents the first halfword as well as the machine instruction + // passed to decodeThumbInstruction(). For 16-bit Thumb instruction, the top + // halfword of insn is 0x00 0x00; otherwise, the first halfword is moved to + // the top half followed by the second halfword. + uint32_t insn = 0; + // Possible second halfword. + uint16_t insn1 = 0; + + // A6.1 Thumb instruction set encoding + // + // If bits [15:11] of the halfword being decoded take any of the following + // values, the halfword is the first halfword of a 32-bit instruction: + // o 0b11101 + // o 0b11110 + // o 0b11111. + // + // Otherwise, the halfword is a 16-bit instruction. + + // Read 2 bytes of data first. + uint8_t bytes[2]; + if (Region.readBytes(Address, 2, (uint8_t*)bytes, NULL) == -1) + return false; + + // Encoded as a small-endian 16-bit halfword in the stream. + insn = (bytes[1] << 8) | bytes[0]; + unsigned bits15_11 = slice(insn, 15, 11); + bool IsThumb2 = false; + + // 32-bit instructions if the bits [15:11] of the halfword matches + // { 0b11101 /* 0x1D */, 0b11110 /* 0x1E */, ob11111 /* 0x1F */ }. + if (bits15_11 == 0x1D || bits15_11 == 0x1E || bits15_11 == 0x1F) { + IsThumb2 = true; + if (Region.readBytes(Address + 2, 2, (uint8_t*)bytes, NULL) == -1) + return false; + // Encoded as a small-endian 16-bit halfword in the stream. + insn1 = (bytes[1] << 8) | bytes[0]; + insn = (insn << 16 | insn1); + } + + // The insn could potentially be bit-twiddled in order to be decoded as an ARM + // NEON/VFP opcode. In such case, the modified insn is later disassembled as + // an ARM NEON/VFP instruction. + // + // This is a short term solution for lack of encoding bits specified for the + // Thumb2 NEON/VFP instructions. The long term solution could be adding some + // infrastructure to have each instruction support more than one encodings. + // Which encoding is used would be based on which subtarget the compiler/ + // disassembler is working with at the time. This would allow the sharing of + // the NEON patterns between ARM and Thumb2, as well as potential greater + // sharing between the regular ARM instructions and the 32-bit wide Thumb2 + // instructions as well. + unsigned Opcode = decodeThumbSideEffect(IsThumb2, insn); + + // A8.6.117/119/120/121. + // PLD/PLDW/PLI instructions with Rn==15 is transformed to the pci variant. + if (Thumb2PreloadOpcodeNoPCI(Opcode) && slice(insn, 19, 16) == 15) + Opcode = T2Morph2Preload2PCI(Opcode); + + ARMFormat Format = ARMFormats[Opcode]; + Size = IsThumb2 ? 
4 : 2; + + DEBUG({ + errs() << "Opcode=" << Opcode << " Name=" << ARMUtils::OpcodeName(Opcode) + << " Format=" << stringForARMFormat(Format) << '(' << (int)Format + << ")\n"; + showBitVector(errs(), insn); + }); + + ARMBasicMCBuilder *Builder = CreateMCBuilder(Opcode, Format); + if (!Builder) + return false; + + Builder->SetSession(const_cast<Session *>(&SO)); + + if (!Builder->Build(MI, insn)) + return false; + + delete Builder; + + return true; +} + +// A8.6.50 +// Valid return values are {1, 2, 3, 4}, with 0 signifying an error condition. +static unsigned short CountITSize(unsigned ITMask) { + // First count the trailing zeros of the IT mask. + unsigned TZ = CountTrailingZeros_32(ITMask); + if (TZ > 3) { + DEBUG(errs() << "Encoding error: IT Mask '0000'"); + return 0; + } + return (4 - TZ); +} + +/// Init ITState. Note that at least one bit is always 1 in mask. +bool Session::InitIT(unsigned short bits7_0) { + ITCounter = CountITSize(slice(bits7_0, 3, 0)); + if (ITCounter == 0) + return false; + + // A8.6.50 IT + unsigned short FirstCond = slice(bits7_0, 7, 4); + if (FirstCond == 0xF) { + DEBUG(errs() << "Encoding error: IT FirstCond '1111'"); + return false; + } + if (FirstCond == 0xE && ITCounter != 1) { + DEBUG(errs() << "Encoding error: IT FirstCond '1110' && Mask != '1000'"); + return false; + } + + ITState = bits7_0; + + return true; +} + +/// Update ITState if necessary. +void Session::UpdateIT() { + assert(ITCounter); + --ITCounter; + if (ITCounter == 0) + ITState = 0; + else { + unsigned short NewITState4_0 = slice(ITState, 4, 0) << 1; + setSlice(ITState, 4, 0, NewITState4_0); + } +} + +static MCDisassembler *createARMDisassembler(const Target &T) { + return new ARMDisassembler; +} + +static MCDisassembler *createThumbDisassembler(const Target &T) { + return new ThumbDisassembler; +} + +extern "C" void LLVMInitializeARMDisassembler() { + // Register the disassembler. + TargetRegistry::RegisterMCDisassembler(TheARMTarget, + createARMDisassembler); + TargetRegistry::RegisterMCDisassembler(TheThumbTarget, + createThumbDisassembler); +} + +EDInstInfo *ARMDisassembler::getEDInfo() const { + return instInfoARM; +} + +EDInstInfo *ThumbDisassembler::getEDInfo() const { + return instInfoARM; +} diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.h b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.h new file mode 100644 index 0000000..0a74a38 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.h @@ -0,0 +1,99 @@ +//===- ARMDisassembler.h - Disassembler for ARM/Thumb ISA -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the ARM Disassembler. +// It contains the header for ARMDisassembler and ThumbDisassembler, both are +// subclasses of MCDisassembler. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMDISASSEMBLER_H +#define ARMDISASSEMBLER_H + +#include "llvm/MC/MCDisassembler.h" + +namespace llvm { + +class MCInst; +class MemoryObject; +class raw_ostream; + +struct EDInstInfo; + +/// ARMDisassembler - ARM disassembler for all ARM platforms. +class ARMDisassembler : public MCDisassembler { +public: + /// Constructor - Initializes the disassembler. 
+ /// + ARMDisassembler() : + MCDisassembler() { + } + + ~ARMDisassembler() { + } + + /// getInstruction - See MCDisassembler. + bool getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream) const; + + /// getEDInfo - See MCDisassembler. + EDInstInfo *getEDInfo() const; +private: +}; + +// Forward declaration. +class ARMBasicMCBuilder; + +/// Session - Keep track of the IT Block progression. +class Session { + friend class ARMBasicMCBuilder; +public: + Session() : ITCounter(0), ITState(0) {} + ~Session() {} + /// InitIT - Initializes ITCounter/ITState. + bool InitIT(unsigned short bits7_0); + /// UpdateIT - Updates ITCounter/ITState as IT Block progresses. + void UpdateIT(); + +private: + unsigned ITCounter; // Possible values: 0, 1, 2, 3, 4. + unsigned ITState; // A2.5.2 Consists of IT[7:5] and IT[4:0] initially. +}; + +/// ThumbDisassembler - Thumb disassembler for all ARM platforms. +class ThumbDisassembler : public MCDisassembler { +public: + /// Constructor - Initializes the disassembler. + /// + ThumbDisassembler() : + MCDisassembler(), SO() { + } + + ~ThumbDisassembler() { + } + + /// getInstruction - See MCDisassembler. + bool getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream) const; + + /// getEDInfo - See MCDisassembler. + EDInstInfo *getEDInfo() const; +private: + Session SO; +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp new file mode 100644 index 0000000..adb7795 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp @@ -0,0 +1,3350 @@ +//===- ARMDisassemblerCore.cpp - ARM disassembler helpers -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the ARM Disassembler. +// It contains code to represent the core concepts of Builder and DisassembleFP +// to solve the problem of disassembling an ARM instr. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-disassembler" + +#include "ARMDisassemblerCore.h" +#include "ARMAddressingModes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +/// ARMGenInstrInfo.inc - ARMGenInstrInfo.inc contains the static const +/// TargetInstrDesc ARMInsts[] definition and the TargetOperandInfo[]'s +/// describing the operand info for each ARMInsts[i]. +/// +/// Together with an instruction's encoding format, we can take advantage of the +/// NumOperands and the OpInfo fields of the target instruction description in +/// the quest to build out the MCOperand list for an MCInst. +/// +/// The general guideline is that with a known format, the number of dst and src +/// operands are well-known. The dst is built first, followed by the src +/// operand(s). The operands not yet used at this point are for the Implicit +/// Uses and Defs by this instr. 
For the Uses part, the pred:$p operand is +/// defined with two components: +/// +/// def pred { // Operand PredicateOperand +/// ValueType Type = OtherVT; +/// string PrintMethod = "printPredicateOperand"; +/// string AsmOperandLowerMethod = ?; +/// dag MIOperandInfo = (ops i32imm, CCR); +/// AsmOperandClass ParserMatchClass = ImmAsmOperand; +/// dag DefaultOps = (ops (i32 14), (i32 zero_reg)); +/// } +/// +/// which is manifested by the TargetOperandInfo[] of: +/// +/// { 0, 0|(1<<TOI::Predicate), 0 }, +/// { ARM::CCRRegClassID, 0|(1<<TOI::Predicate), 0 } +/// +/// So the first predicate MCOperand corresponds to the immediate part of the +/// ARM condition field (Inst{31-28}), and the second predicate MCOperand +/// corresponds to a register kind of ARM::CPSR. +/// +/// For the Defs part, in the simple case of only cc_out:$s, we have: +/// +/// def cc_out { // Operand OptionalDefOperand +/// ValueType Type = OtherVT; +/// string PrintMethod = "printSBitModifierOperand"; +/// string AsmOperandLowerMethod = ?; +/// dag MIOperandInfo = (ops CCR); +/// AsmOperandClass ParserMatchClass = ImmAsmOperand; +/// dag DefaultOps = (ops (i32 zero_reg)); +/// } +/// +/// which is manifested by the one TargetOperandInfo of: +/// +/// { ARM::CCRRegClassID, 0|(1<<TOI::OptionalDef), 0 } +/// +/// And this maps to one MCOperand with the regsiter kind of ARM::CPSR. +#include "ARMGenInstrInfo.inc" + +using namespace llvm; + +const char *ARMUtils::OpcodeName(unsigned Opcode) { + return ARMInsts[Opcode].Name; +} + +// Return the register enum Based on RegClass and the raw register number. +// For DRegPair, see comments below. +// FIXME: Auto-gened? +static unsigned getRegisterEnum(BO B, unsigned RegClassID, unsigned RawRegister, + bool DRegPair = false) { + + if (DRegPair && RegClassID == ARM::QPRRegClassID) { + // LLVM expects { Dd, Dd+1 } to form a super register; this is not specified + // in the ARM Architecture Manual as far as I understand it (A8.6.307). + // Therefore, we morph the RegClassID to be the sub register class and don't + // subsequently transform the RawRegister encoding when calculating RegNum. + // + // See also ARMinstPrinter::printOperand() wrt "dregpair" modifier part + // where this workaround is meant for. + RegClassID = ARM::DPRRegClassID; + } + + // See also decodeNEONRd(), decodeNEONRn(), decodeNEONRm(). + unsigned RegNum = + RegClassID == ARM::QPRRegClassID ? 
RawRegister >> 1 : RawRegister; + + switch (RegNum) { + default: + break; + case 0: + switch (RegClassID) { + case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R0; + case ARM::DPRRegClassID: case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + return ARM::D0; + case ARM::QPRRegClassID: case ARM::QPR_8RegClassID: + case ARM::QPR_VFP2RegClassID: + return ARM::Q0; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S0; + } + break; + case 1: + switch (RegClassID) { + case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R1; + case ARM::DPRRegClassID: case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + return ARM::D1; + case ARM::QPRRegClassID: case ARM::QPR_8RegClassID: + case ARM::QPR_VFP2RegClassID: + return ARM::Q1; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S1; + } + break; + case 2: + switch (RegClassID) { + case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R2; + case ARM::DPRRegClassID: case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + return ARM::D2; + case ARM::QPRRegClassID: case ARM::QPR_8RegClassID: + case ARM::QPR_VFP2RegClassID: + return ARM::Q2; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S2; + } + break; + case 3: + switch (RegClassID) { + case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R3; + case ARM::DPRRegClassID: case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + return ARM::D3; + case ARM::QPRRegClassID: case ARM::QPR_8RegClassID: + case ARM::QPR_VFP2RegClassID: + return ARM::Q3; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S3; + } + break; + case 4: + switch (RegClassID) { + case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R4; + case ARM::DPRRegClassID: case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + return ARM::D4; + case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q4; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S4; + } + break; + case 5: + switch (RegClassID) { + case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R5; + case ARM::DPRRegClassID: case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + return ARM::D5; + case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q5; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S5; + } + break; + case 6: + switch (RegClassID) { + case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R6; + case ARM::DPRRegClassID: case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + return ARM::D6; + case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q6; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S6; + } + break; + case 7: + switch (RegClassID) { + case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R7; + case ARM::DPRRegClassID: case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + return ARM::D7; + case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q7; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S7; + } + break; + case 8: + switch (RegClassID) { + case ARM::GPRRegClassID: return ARM::R8; + case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D8; + case ARM::QPRRegClassID: return ARM::Q8; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S8; + } + break; + case 9: + switch (RegClassID) { + case ARM::GPRRegClassID: return ARM::R9; + case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D9; + case ARM::QPRRegClassID: 
return ARM::Q9; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S9; + } + break; + case 10: + switch (RegClassID) { + case ARM::GPRRegClassID: return ARM::R10; + case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D10; + case ARM::QPRRegClassID: return ARM::Q10; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S10; + } + break; + case 11: + switch (RegClassID) { + case ARM::GPRRegClassID: return ARM::R11; + case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D11; + case ARM::QPRRegClassID: return ARM::Q11; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S11; + } + break; + case 12: + switch (RegClassID) { + case ARM::GPRRegClassID: return ARM::R12; + case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D12; + case ARM::QPRRegClassID: return ARM::Q12; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S12; + } + break; + case 13: + switch (RegClassID) { + case ARM::GPRRegClassID: return ARM::SP; + case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D13; + case ARM::QPRRegClassID: return ARM::Q13; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S13; + } + break; + case 14: + switch (RegClassID) { + case ARM::GPRRegClassID: return ARM::LR; + case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D14; + case ARM::QPRRegClassID: return ARM::Q14; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S14; + } + break; + case 15: + switch (RegClassID) { + case ARM::GPRRegClassID: return ARM::PC; + case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D15; + case ARM::QPRRegClassID: return ARM::Q15; + case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S15; + } + break; + case 16: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D16; + case ARM::SPRRegClassID: return ARM::S16; + } + break; + case 17: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D17; + case ARM::SPRRegClassID: return ARM::S17; + } + break; + case 18: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D18; + case ARM::SPRRegClassID: return ARM::S18; + } + break; + case 19: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D19; + case ARM::SPRRegClassID: return ARM::S19; + } + break; + case 20: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D20; + case ARM::SPRRegClassID: return ARM::S20; + } + break; + case 21: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D21; + case ARM::SPRRegClassID: return ARM::S21; + } + break; + case 22: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D22; + case ARM::SPRRegClassID: return ARM::S22; + } + break; + case 23: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D23; + case ARM::SPRRegClassID: return ARM::S23; + } + break; + case 24: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D24; + case ARM::SPRRegClassID: return ARM::S24; + } + break; + case 25: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D25; + case ARM::SPRRegClassID: return ARM::S25; + } + break; + case 26: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D26; + case ARM::SPRRegClassID: return ARM::S26; + } + break; + case 27: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D27; + case ARM::SPRRegClassID: return ARM::S27; + } + break; + case 28: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D28; + case ARM::SPRRegClassID: return ARM::S28; + } + break; + 
case 29: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D29; + case ARM::SPRRegClassID: return ARM::S29; + } + break; + case 30: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D30; + case ARM::SPRRegClassID: return ARM::S30; + } + break; + case 31: + switch (RegClassID) { + case ARM::DPRRegClassID: return ARM::D31; + case ARM::SPRRegClassID: return ARM::S31; + } + break; + } + DEBUG(errs() << "Invalid (RegClassID, RawRegister) combination\n"); + // Encoding error. Mark the builder with error code != 0. + B->SetErr(-1); + return 0; +} + +/////////////////////////////// +// // +// Utility Functions // +// // +/////////////////////////////// + +// Extract/Decode Rd: Inst{15-12}. +static inline unsigned decodeRd(uint32_t insn) { + return (insn >> ARMII::RegRdShift) & ARMII::GPRRegMask; +} + +// Extract/Decode Rn: Inst{19-16}. +static inline unsigned decodeRn(uint32_t insn) { + return (insn >> ARMII::RegRnShift) & ARMII::GPRRegMask; +} + +// Extract/Decode Rm: Inst{3-0}. +static inline unsigned decodeRm(uint32_t insn) { + return (insn & ARMII::GPRRegMask); +} + +// Extract/Decode Rs: Inst{11-8}. +static inline unsigned decodeRs(uint32_t insn) { + return (insn >> ARMII::RegRsShift) & ARMII::GPRRegMask; +} + +static inline unsigned getCondField(uint32_t insn) { + return (insn >> ARMII::CondShift); +} + +static inline unsigned getIBit(uint32_t insn) { + return (insn >> ARMII::I_BitShift) & 1; +} + +static inline unsigned getAM3IBit(uint32_t insn) { + return (insn >> ARMII::AM3_I_BitShift) & 1; +} + +static inline unsigned getPBit(uint32_t insn) { + return (insn >> ARMII::P_BitShift) & 1; +} + +static inline unsigned getUBit(uint32_t insn) { + return (insn >> ARMII::U_BitShift) & 1; +} + +static inline unsigned getPUBits(uint32_t insn) { + return (insn >> ARMII::U_BitShift) & 3; +} + +static inline unsigned getSBit(uint32_t insn) { + return (insn >> ARMII::S_BitShift) & 1; +} + +static inline unsigned getWBit(uint32_t insn) { + return (insn >> ARMII::W_BitShift) & 1; +} + +static inline unsigned getDBit(uint32_t insn) { + return (insn >> ARMII::D_BitShift) & 1; +} + +static inline unsigned getNBit(uint32_t insn) { + return (insn >> ARMII::N_BitShift) & 1; +} + +static inline unsigned getMBit(uint32_t insn) { + return (insn >> ARMII::M_BitShift) & 1; +} + +// See A8.4 Shifts applied to a register. +// A8.4.2 Register controlled shifts. +// +// getShiftOpcForBits - getShiftOpcForBits translates from the ARM encoding bits +// into llvm enums for shift opcode. The API clients should pass in the value +// encoded with two bits, so the assert stays to signal a wrong API usage. +// +// A8-12: DecodeRegShift() +static inline ARM_AM::ShiftOpc getShiftOpcForBits(unsigned bits) { + switch (bits) { + default: assert(0 && "No such value"); return ARM_AM::no_shift; + case 0: return ARM_AM::lsl; + case 1: return ARM_AM::lsr; + case 2: return ARM_AM::asr; + case 3: return ARM_AM::ror; + } +} + +// See A8.4 Shifts applied to a register. +// A8.4.1 Constant shifts. +// +// getImmShiftSE - getImmShiftSE translates from the raw ShiftOpc and raw Imm5 +// encodings into the intended ShiftOpc and shift amount. +// +// A8-11: DecodeImmShift() +static inline void getImmShiftSE(ARM_AM::ShiftOpc &ShOp, unsigned &ShImm) { + // If type == 0b11 and imm5 == 0, we have an rrx, instead. + if (ShOp == ARM_AM::ror && ShImm == 0) + ShOp = ARM_AM::rrx; + // If (lsr or asr) and imm5 == 0, shift amount is 32. 
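+  // For example, raw bits type = 0b10 (asr) with imm5 = 0b00000 denote
+  // ASR #32, so ShImm is rewritten from 0 to 32 below; likewise the earlier
+  // check turned type = 0b11 (ror) with imm5 = 0 into rrx with ShImm = 0.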
+ if ((ShOp == ARM_AM::lsr || ShOp == ARM_AM::asr) && ShImm == 0) + ShImm = 32; +} + +// getAMSubModeForBits - getAMSubModeForBits translates from the ARM encoding +// bits Inst{24-23} (P(24) and U(23)) into llvm enums for AMSubMode. The API +// clients should pass in the value encoded with two bits, so the assert stays +// to signal a wrong API usage. +static inline ARM_AM::AMSubMode getAMSubModeForBits(unsigned bits) { + switch (bits) { + default: assert(0 && "No such value"); return ARM_AM::bad_am_submode; + case 1: return ARM_AM::ia; // P=0 U=1 + case 3: return ARM_AM::ib; // P=1 U=1 + case 0: return ARM_AM::da; // P=0 U=0 + case 2: return ARM_AM::db; // P=1 U=0 + } +} + +//////////////////////////////////////////// +// // +// Disassemble function definitions // +// // +//////////////////////////////////////////// + +/// There is a separate Disassemble*Frm function entry for disassembly of an ARM +/// instr into a list of MCOperands in the appropriate order, with possible dst, +/// followed by possible src(s). +/// +/// The processing of the predicate, and the 'S' modifier bit, if MI modifies +/// the CPSR, is factored into ARMBasicMCBuilder's method named +/// TryPredicateAndSBitModifier. + +static bool DisassemblePseudo(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO) { + + if (Opcode == ARM::Int_MemBarrierV7 || Opcode == ARM::Int_SyncBarrierV7) + return true; + + assert(0 && "Unexpected pseudo instruction!"); + return false; +} + +// Multiply Instructions. +// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLS: +// Rd{19-16} Rn{3-0} Rm{11-8} Ra{15-12} +// +// MUL, SMMUL, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT: +// Rd{19-16} Rn{3-0} Rm{11-8} +// +// SMLAL, SMULL, UMAAL, UMLAL, UMULL, SMLALBB, SMLALBT, SMLALTB, SMLALTT: +// RdLo{15-12} RdHi{19-16} Rn{3-0} Rm{11-8} +// +// The mapping of the multiply registers to the "regular" ARM registers, where +// there are convenience decoder functions, is: +// +// Inst{15-12} => Rd +// Inst{19-16} => Rn +// Inst{3-0} => Rm +// Inst{11-8} => Rs +static bool DisassembleMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + unsigned short NumDefs = TID.getNumDefs(); + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumDefs > 0 && "NumDefs should be greater than 0 for MulFrm"); + assert(NumOps >= 3 + && OpInfo[0].RegClass == ARM::GPRRegClassID + && OpInfo[1].RegClass == ARM::GPRRegClassID + && OpInfo[2].RegClass == ARM::GPRRegClassID + && "Expect three register operands"); + + // Instructions with two destination registers have RdLo{15-12} first. + if (NumDefs == 2) { + assert(NumOps >= 4 && OpInfo[3].RegClass == ARM::GPRRegClassID && + "Expect 4th register operand"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + } + + // The destination register: RdHi{19-16} or Rd{19-16}. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + + // The two src regsiters: Rn{3-0}, then Rm{11-8}. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRs(insn)))); + OpIdx += 3; + + // Many multiply instructions (e.g., MLA) have three src registers. + // The third register operand is Ra{15-12}. 
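+  // For example, MLA is encoded as Rd{19-16} Rn{3-0} Rm{11-8} Ra{15-12}, so
+  // decodeRd(insn) (Inst{15-12}) here yields the accumulator Ra rather than
+  // the destination, which was already added above via decodeRn(insn).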
+ if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + } + + return true; +} + +// Helper routines for disassembly of coprocessor instructions. + +static bool LdStCopOpcode(unsigned Opcode) { + if ((Opcode >= ARM::LDC2L_OFFSET && Opcode <= ARM::LDC_PRE) || + (Opcode >= ARM::STC2L_OFFSET && Opcode <= ARM::STC_PRE)) + return true; + return false; +} +static bool CoprocessorOpcode(unsigned Opcode) { + if (LdStCopOpcode(Opcode)) + return true; + + switch (Opcode) { + default: + return false; + case ARM::CDP: case ARM::CDP2: + case ARM::MCR: case ARM::MCR2: case ARM::MRC: case ARM::MRC2: + case ARM::MCRR: case ARM::MCRR2: case ARM::MRRC: case ARM::MRRC2: + return true; + } +} +static inline unsigned GetCoprocessor(uint32_t insn) { + return slice(insn, 11, 8); +} +static inline unsigned GetCopOpc1(uint32_t insn, bool CDP) { + return CDP ? slice(insn, 23, 20) : slice(insn, 23, 21); +} +static inline unsigned GetCopOpc2(uint32_t insn) { + return slice(insn, 7, 5); +} +static inline unsigned GetCopOpc(uint32_t insn) { + return slice(insn, 7, 4); +} +// Most of the operands are in immediate forms, except Rd and Rn, which are ARM +// core registers. +// +// CDP, CDP2: cop opc1 CRd CRn CRm opc2 +// +// MCR, MCR2, MRC, MRC2: cop opc1 Rd CRn CRm opc2 +// +// MCRR, MCRR2, MRRC, MRRc2: cop opc Rd Rn CRm +// +// LDC_OFFSET, LDC_PRE, LDC_POST: cop CRd Rn R0 [+/-]imm8:00 +// and friends +// STC_OFFSET, STC_PRE, STC_POST: cop CRd Rn R0 [+/-]imm8:00 +// and friends +// <-- addrmode2 --> +// +// LDC_OPTION: cop CRd Rn imm8 +// and friends +// STC_OPTION: cop CRd Rn imm8 +// and friends +// +static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 5 && "Num of operands >= 5 for coprocessor instr"); + + unsigned &OpIdx = NumOpsAdded; + bool OneCopOpc = (Opcode == ARM::MCRR || Opcode == ARM::MCRR2 || + Opcode == ARM::MRRC || Opcode == ARM::MRRC2); + // CDP/CDP2 has no GPR operand; the opc1 operand is also wider (Inst{23-20}). + bool NoGPR = (Opcode == ARM::CDP || Opcode == ARM::CDP2); + bool LdStCop = LdStCopOpcode(Opcode); + + OpIdx = 0; + + MI.addOperand(MCOperand::CreateImm(GetCoprocessor(insn))); + + if (LdStCop) { + // Unindex if P:W = 0b00 --> _OPTION variant + unsigned PW = getPBit(insn) << 1 | getWBit(insn); + + MI.addOperand(MCOperand::CreateImm(decodeRd(insn))); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + + if (PW) { + MI.addOperand(MCOperand::CreateReg(0)); + ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, slice(insn, 7, 0) << 2, + ARM_AM::no_shift); + MI.addOperand(MCOperand::CreateImm(Offset)); + OpIdx = 5; + } else { + MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 0))); + OpIdx = 4; + } + } else { + MI.addOperand(MCOperand::CreateImm(OneCopOpc ? GetCopOpc(insn) + : GetCopOpc1(insn, NoGPR))); + + MI.addOperand(NoGPR ? MCOperand::CreateImm(decodeRd(insn)) + : MCOperand::CreateReg( + getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + + MI.addOperand(OneCopOpc ? 
MCOperand::CreateReg( + getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn))) + : MCOperand::CreateImm(decodeRn(insn))); + + MI.addOperand(MCOperand::CreateImm(decodeRm(insn))); + + OpIdx = 5; + + if (!OneCopOpc) { + MI.addOperand(MCOperand::CreateImm(GetCopOpc2(insn))); + ++OpIdx; + } + } + + return true; +} + +// Branch Instructions. +// BLr9: SignExtend(Imm24:'00', 32) +// Bcc, BLr9_pred: SignExtend(Imm24:'00', 32) Pred0 Pred1 +// SMC: ZeroExtend(imm4, 32) +// SVC: ZeroExtend(Imm24, 32) +// +// Various coprocessor instructions are assigned BrFrm arbitrarily. +// Delegates to DisassembleCoprocessor() helper function. +// +// MRS/MRSsys: Rd +// MSR/MSRsys: Rm mask=Inst{19-16} +// BXJ: Rm +// MSRi/MSRsysi: so_imm +// SRSW/SRS: addrmode4:$addr mode_imm +// RFEW/RFE: addrmode4:$addr Rn +static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + if (CoprocessorOpcode(Opcode)) + return DisassembleCoprocessor(MI, Opcode, insn, NumOps, NumOpsAdded, B); + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + // MRS and MRSsys take one GPR reg Rd. + if (Opcode == ARM::MRS || Opcode == ARM::MRSsys) { + assert(NumOps >= 1 && OpInfo[0].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + NumOpsAdded = 1; + return true; + } + // BXJ takes one GPR reg Rm. + if (Opcode == ARM::BXJ) { + assert(NumOps >= 1 && OpInfo[0].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + NumOpsAdded = 1; + return true; + } + // MSR and MSRsys take one GPR reg Rm, followed by the mask. + if (Opcode == ARM::MSR || Opcode == ARM::MSRsys) { + assert(NumOps >= 1 && OpInfo[0].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 16))); + NumOpsAdded = 2; + return true; + } + // MSRi and MSRsysi take one so_imm operand, followed by the mask. + if (Opcode == ARM::MSRi || Opcode == ARM::MSRsysi) { + // SOImm is 4-bit rotate amount in bits 11-8 with 8-bit imm in bits 7-0. + // A5.2.4 Rotate amount is twice the numeric value of Inst{11-8}. + // See also ARMAddressingModes.h: getSOImmValImm() and getSOImmValRot(). + unsigned Rot = (insn >> ARMII::SoRotImmShift) & 0xF; + unsigned Imm = insn & 0xFF; + MI.addOperand(MCOperand::CreateImm(ARM_AM::rotr32(Imm, 2*Rot))); + MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 16))); + NumOpsAdded = 2; + return true; + } + // SRSW and SRS requires addrmode4:$addr for ${addr:submode}, followed by the + // mode immediate (Inst{4-0}). + if (Opcode == ARM::SRSW || Opcode == ARM::SRS || + Opcode == ARM::RFEW || Opcode == ARM::RFE) { + // ARMInstPrinter::printAddrMode4Operand() prints special mode string + // if the base register is SP; so don't set ARM::SP. 
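+    // The submode below comes from the P and U bits (Inst{24-23}): for
+    // example, P=1 U=0 selects ARM_AM::db and P=0 U=1 selects ARM_AM::ia,
+    // matching the "db"/"ia" suffixes printed for these instructions.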
+ MI.addOperand(MCOperand::CreateReg(0)); + ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn)); + MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode))); + + if (Opcode == ARM::SRSW || Opcode == ARM::SRS) + MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); + else + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + NumOpsAdded = 3; + return true; + } + + assert((Opcode == ARM::Bcc || Opcode == ARM::BLr9 || Opcode == ARM::BLr9_pred + || Opcode == ARM::SMC || Opcode == ARM::SVC) && + "Unexpected Opcode"); + + assert(NumOps >= 1 && OpInfo[0].RegClass == 0 && "Reg operand expected"); + + int Imm32 = 0; + if (Opcode == ARM::SMC) { + // ZeroExtend(imm4, 32) where imm24 = Inst{3-0}. + Imm32 = slice(insn, 3, 0); + } else if (Opcode == ARM::SVC) { + // ZeroExtend(imm24, 32) where imm24 = Inst{23-0}. + Imm32 = slice(insn, 23, 0); + } else { + // SignExtend(imm24:'00', 32) where imm24 = Inst{23-0}. + unsigned Imm26 = slice(insn, 23, 0) << 2; + //Imm32 = signextend<signed int, 26>(Imm26); + Imm32 = SignExtend32<26>(Imm26); + + // When executing an ARM instruction, PC reads as the address of the current + // instruction plus 8. The assembler subtracts 8 from the difference + // between the branch instruction and the target address, disassembler has + // to add 8 to compensate. + Imm32 += 8; + } + + MI.addOperand(MCOperand::CreateImm(Imm32)); + NumOpsAdded = 1; + + return true; +} + +// Misc. Branch Instructions. +// BR_JTadd, BR_JTr, BR_JTm +// BLXr9, BXr9 +// BRIND, BX_RET +static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + // BX_RET has only two predicate operands, do an early return. + if (Opcode == ARM::BX_RET) + return true; + + // BLXr9 and BRIND take one GPR reg. + if (Opcode == ARM::BLXr9 || Opcode == ARM::BRIND) { + assert(NumOps >= 1 && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + OpIdx = 1; + return true; + } + + // BR_JTadd is an ADD with Rd = PC, (Rn, Rm) as the target and index regs. + if (Opcode == ARM::BR_JTadd) { + // InOperandList with GPR:$target and GPR:$idx regs. + + assert(NumOps == 4 && "Expect 4 operands"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + + // Fill in the two remaining imm operands to signify build completion. + MI.addOperand(MCOperand::CreateImm(0)); + MI.addOperand(MCOperand::CreateImm(0)); + + OpIdx = 4; + return true; + } + + // BR_JTr is a MOV with Rd = PC, and Rm as the source register. + if (Opcode == ARM::BR_JTr) { + // InOperandList with GPR::$target reg. + + assert(NumOps == 3 && "Expect 3 operands"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + + // Fill in the two remaining imm operands to signify build completion. + MI.addOperand(MCOperand::CreateImm(0)); + MI.addOperand(MCOperand::CreateImm(0)); + + OpIdx = 3; + return true; + } + + // BR_JTm is an LDR with Rt = PC. + if (Opcode == ARM::BR_JTm) { + // This is the reg/reg form, with base reg followed by +/- reg shop imm. 
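+    // Here "shop" is shorthand for the shift opcode. For example, with U=1,
+    // Rm=R3, Inst{6-5} = 0b00 (lsl) and Inst{11-7} = 2, the operands added
+    // below describe an offset of +R3 shifted left by 2, i.e. [Rn, R3, lsl #2].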
+ // See also ARMAddressingModes.h (Addressing Mode #2). + + assert(NumOps == 5 && getIBit(insn) == 1 && "Expect 5 operands && I-bit=1"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + + ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + + // Disassemble the offset reg (Rm), shift type, and immediate shift length. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + // Inst{6-5} encodes the shift opcode. + ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5)); + // Inst{11-7} encodes the imm5 shift amount. + unsigned ShImm = slice(insn, 11, 7); + + // A8.4.1. Possible rrx or shift amount of 32... + getImmShiftSE(ShOp, ShImm); + MI.addOperand(MCOperand::CreateImm( + ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp))); + + // Fill in the two remaining imm operands to signify build completion. + MI.addOperand(MCOperand::CreateImm(0)); + MI.addOperand(MCOperand::CreateImm(0)); + + OpIdx = 5; + return true; + } + + assert(0 && "Unexpected BrMiscFrm Opcode"); + return false; +} + +static inline bool getBFCInvMask(uint32_t insn, uint32_t &mask) { + uint32_t lsb = slice(insn, 11, 7); + uint32_t msb = slice(insn, 20, 16); + uint32_t Val = 0; + if (msb < lsb) { + DEBUG(errs() << "Encoding error: msb < lsb\n"); + return false; + } + + for (uint32_t i = lsb; i <= msb; ++i) + Val |= (1 << i); + mask = ~Val; + return true; +} + +static inline bool SaturateOpcode(unsigned Opcode) { + switch (Opcode) { + case ARM::SSATlsl: case ARM::SSATasr: case ARM::SSAT16: + case ARM::USATlsl: case ARM::USATasr: case ARM::USAT16: + return true; + default: + return false; + } +} + +static inline unsigned decodeSaturatePos(unsigned Opcode, uint32_t insn) { + switch (Opcode) { + case ARM::SSATlsl: + case ARM::SSATasr: + return slice(insn, 20, 16) + 1; + case ARM::SSAT16: + return slice(insn, 19, 16) + 1; + case ARM::USATlsl: + case ARM::USATasr: + return slice(insn, 20, 16); + case ARM::USAT16: + return slice(insn, 19, 16); + default: + assert(0 && "Invalid opcode passed in"); + return 0; + } +} + +// A major complication is the fact that some of the saturating add/subtract +// operations have Rd Rm Rn, instead of the "normal" Rd Rn Rm. +// They are QADD, QDADD, QDSUB, and QSUB. +static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + unsigned short NumDefs = TID.getNumDefs(); + bool isUnary = isUnaryDP(TID.TSFlags); + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + // Disassemble register def if there is one. + if (NumDefs && (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID)) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + } + + // Now disassemble the src operands. + if (OpIdx >= NumOps) + return false; + + // SSAT/SSAT16/USAT/USAT16 has imm operand after Rd. + if (SaturateOpcode(Opcode)) { + MI.addOperand(MCOperand::CreateImm(decodeSaturatePos(Opcode, insn))); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + + if (Opcode == ARM::SSAT16 || Opcode == ARM::USAT16) { + OpIdx += 2; + return true; + } + + // For SSAT operand reg (Rm) has been disassembled above. + // Now disassemble the shift amount. + + // Inst{11-7} encodes the imm5 shift amount. + unsigned ShAmt = slice(insn, 11, 7); + + // A8.6.183. 
Possible ASR shift amount of 32... + if (Opcode == ARM::SSATasr && ShAmt == 0) + ShAmt = 32; + + MI.addOperand(MCOperand::CreateImm(ShAmt)); + + OpIdx += 3; + return true; + } + + // Special-case handling of BFC/BFI/SBFX/UBFX. + if (Opcode == ARM::BFC || Opcode == ARM::BFI) { + // TIED_TO operand skipped for BFC and Inst{3-0} (Reg) for BFI. + MI.addOperand(MCOperand::CreateReg(Opcode == ARM::BFC ? 0 + : getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + uint32_t mask = 0; + if (!getBFCInvMask(insn, mask)) + return false; + + MI.addOperand(MCOperand::CreateImm(mask)); + OpIdx += 2; + return true; + } + if (Opcode == ARM::SBFX || Opcode == ARM::UBFX) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + MI.addOperand(MCOperand::CreateImm(slice(insn, 11, 7))); + MI.addOperand(MCOperand::CreateImm(slice(insn, 20, 16) + 1)); + OpIdx += 3; + return true; + } + + bool RmRn = (Opcode == ARM::QADD || Opcode == ARM::QDADD || + Opcode == ARM::QDSUB || Opcode == ARM::QSUB); + + // BinaryDP has an Rn operand. + if (!isUnary) { + assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::GPRRegClassID, + RmRn ? decodeRm(insn) : decodeRn(insn)))); + ++OpIdx; + } + + // If this is a two-address operand, skip it, e.g., MOVCCr operand 1. + if (isUnary && (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)) { + MI.addOperand(MCOperand::CreateReg(0)); + ++OpIdx; + } + + // Now disassemble operand 2. + if (OpIdx >= NumOps) + return false; + + if (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) { + // We have a reg/reg form. + // Assert disabled because saturating operations, e.g., A8.6.127 QASX, are + // routed here as well. + // assert(getIBit(insn) == 0 && "I_Bit != '0' reg/reg form"); + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::GPRRegClassID, + RmRn? decodeRn(insn) : decodeRm(insn)))); + ++OpIdx; + } else if (Opcode == ARM::MOVi16 || Opcode == ARM::MOVTi16) { + // We have an imm16 = imm4:imm12 (imm4=Inst{19:16}, imm12 = Inst{11:0}). + assert(getIBit(insn) == 1 && "I_Bit != '1' reg/imm form"); + unsigned Imm16 = slice(insn, 19, 16) << 12 | slice(insn, 11, 0); + MI.addOperand(MCOperand::CreateImm(Imm16)); + ++OpIdx; + } else { + // We have a reg/imm form. + // SOImm is 4-bit rotate amount in bits 11-8 with 8-bit imm in bits 7-0. + // A5.2.4 Rotate amount is twice the numeric value of Inst{11-8}. + // See also ARMAddressingModes.h: getSOImmValImm() and getSOImmValRot(). + assert(getIBit(insn) == 1 && "I_Bit != '1' reg/imm form"); + unsigned Rot = (insn >> ARMII::SoRotImmShift) & 0xF; + unsigned Imm = insn & 0xFF; + MI.addOperand(MCOperand::CreateImm(ARM_AM::rotr32(Imm, 2*Rot))); + ++OpIdx; + } + + return true; +} + +static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + unsigned short NumDefs = TID.getNumDefs(); + bool isUnary = isUnaryDP(TID.TSFlags); + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + // Disassemble register def if there is one. + if (NumDefs && (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID)) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + } + + // Disassemble the src operands. + if (OpIdx >= NumOps) + return false; + + // BinaryDP has an Rn operand. 
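+  // For example, a register-shifted MOV is unary and has no Rn source, while
+  // a register-shifted AND is binary and reads Rn from Inst{19-16}; in both
+  // cases the shifter operand itself is added below as the [Rm, Rs or reg0,
+  // shift] triple.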
+ if (!isUnary) { + assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + } + + // If this is a two-address operand, skip it, e.g., MOVCCs operand 1. + if (isUnary && (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)) { + MI.addOperand(MCOperand::CreateReg(0)); + ++OpIdx; + } + + // Disassemble operand 2, which consists of three components. + if (OpIdx + 2 >= NumOps) + return false; + + assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && + (OpInfo[OpIdx+1].RegClass == ARM::GPRRegClassID) && + (OpInfo[OpIdx+2].RegClass == 0) && + "Expect 3 reg operands"); + + // Register-controlled shifts have Inst{7} = 0 and Inst{4} = 1. + unsigned Rs = slice(insn, 4, 4); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + if (Rs) { + // Register-controlled shifts: [Rm, Rs, shift]. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRs(insn)))); + // Inst{6-5} encodes the shift opcode. + ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5)); + MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, 0))); + } else { + // Constant shifts: [Rm, reg0, shift_imm]. + MI.addOperand(MCOperand::CreateReg(0)); // NoRegister + // Inst{6-5} encodes the shift opcode. + ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5)); + // Inst{11-7} encodes the imm5 shift amount. + unsigned ShImm = slice(insn, 11, 7); + + // A8.4.1. Possible rrx or shift amount of 32... + getImmShiftSE(ShOp, ShImm); + MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, ShImm))); + } + OpIdx += 3; + + return true; +} + +static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, bool isStore, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + bool isPrePost = isPrePostLdSt(TID.TSFlags); + const TargetOperandInfo *OpInfo = TID.OpInfo; + if (!OpInfo) return false; + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(((!isStore && TID.getNumDefs() > 0) || + (isStore && (TID.getNumDefs() == 0 || isPrePost))) + && "Invalid arguments"); + + // Operand 0 of a pre- and post-indexed store is the address base writeback. + if (isPrePost && isStore) { + assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + } + + // Disassemble the dst/src operand. + if (OpIdx >= NumOps) + return false; + + assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + + // After dst of a pre- and post-indexed load is the address base writeback. + if (isPrePost && !isStore) { + assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + } + + // Disassemble the base operand. 
+ if (OpIdx >= NumOps) + return false; + + assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + assert((!isPrePost || (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)) + && "Index mode or tied_to operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + + // For reg/reg form, base reg is followed by +/- reg shop imm. + // For immediate form, it is followed by +/- imm12. + // See also ARMAddressingModes.h (Addressing Mode #2). + if (OpIdx + 1 >= NumOps) + return false; + + assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && + (OpInfo[OpIdx+1].RegClass == 0) && + "Expect 1 reg operand followed by 1 imm operand"); + + ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + if (getIBit(insn) == 0) { + MI.addOperand(MCOperand::CreateReg(0)); + + // Disassemble the 12-bit immediate offset. + unsigned Imm12 = slice(insn, 11, 0); + unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, Imm12, ARM_AM::no_shift); + MI.addOperand(MCOperand::CreateImm(Offset)); + } else { + // Disassemble the offset reg (Rm), shift type, and immediate shift length. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + // Inst{6-5} encodes the shift opcode. + ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5)); + // Inst{11-7} encodes the imm5 shift amount. + unsigned ShImm = slice(insn, 11, 7); + + // A8.4.1. Possible rrx or shift amount of 32... + getImmShiftSE(ShOp, ShImm); + MI.addOperand(MCOperand::CreateImm( + ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp))); + } + OpIdx += 2; + + return true; +} + +static bool DisassembleLdFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + return DisassembleLdStFrm(MI, Opcode, insn, NumOps, NumOpsAdded, false, B); +} + +static bool DisassembleStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + return DisassembleLdStFrm(MI, Opcode, insn, NumOps, NumOpsAdded, true, B); +} + +static bool HasDualReg(unsigned Opcode) { + switch (Opcode) { + default: + return false; + case ARM::LDRD: case ARM::LDRD_PRE: case ARM::LDRD_POST: + case ARM::STRD: case ARM::STRD_PRE: case ARM::STRD_POST: + return true; + } +} + +static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, bool isStore, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + bool isPrePost = isPrePostLdSt(TID.TSFlags); + const TargetOperandInfo *OpInfo = TID.OpInfo; + if (!OpInfo) return false; + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(((!isStore && TID.getNumDefs() > 0) || + (isStore && (TID.getNumDefs() == 0 || isPrePost))) + && "Invalid arguments"); + + // Operand 0 of a pre- and post-indexed store is the address base writeback. + if (isPrePost && isStore) { + assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + } + + bool DualReg = HasDualReg(Opcode); + + // Disassemble the dst/src operand. + if (OpIdx >= NumOps) + return false; + + assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + + // Fill in LDRD and STRD's second operand. 
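+  // For example, an LDRD whose Rt field decodes to R4 also writes R5, since
+  // the doubleword forms transfer the register pair Rt, Rt+1.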
+ if (DualReg) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn) + 1))); + ++OpIdx; + } + + // After dst of a pre- and post-indexed load is the address base writeback. + if (isPrePost && !isStore) { + assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + } + + // Disassemble the base operand. + if (OpIdx >= NumOps) + return false; + + assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + assert((!isPrePost || (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)) + && "Index mode or tied_to operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + + // For reg/reg form, base reg is followed by +/- reg. + // For immediate form, it is followed by +/- imm8. + // See also ARMAddressingModes.h (Addressing Mode #3). + if (OpIdx + 1 >= NumOps) + return false; + + assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && + (OpInfo[OpIdx+1].RegClass == 0) && + "Expect 1 reg operand followed by 1 imm operand"); + + ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + if (getAM3IBit(insn) == 1) { + MI.addOperand(MCOperand::CreateReg(0)); + + // Disassemble the 8-bit immediate offset. + unsigned Imm4H = (insn >> ARMII::ImmHiShift) & 0xF; + unsigned Imm4L = insn & 0xF; + unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, (Imm4H << 4) | Imm4L); + MI.addOperand(MCOperand::CreateImm(Offset)); + } else { + // Disassemble the offset reg (Rm). + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, 0); + MI.addOperand(MCOperand::CreateImm(Offset)); + } + OpIdx += 2; + + return true; +} + +static bool DisassembleLdMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + return DisassembleLdStMiscFrm(MI, Opcode, insn, NumOps, NumOpsAdded, false, + B); +} + +static bool DisassembleStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + return DisassembleLdStMiscFrm(MI, Opcode, insn, NumOps, NumOpsAdded, true, B); +} + +// The algorithm for disassembly of LdStMulFrm is different from others because +// it explicitly populates the two predicate operands after operand 0 (the base) +// and operand 1 (the AM4 mode imm). After operand 3, we need to populate the +// reglist with each affected register encoded as an MCOperand. +static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 5 && "LdStMulFrm expects NumOps >= 5"); + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)); + + // Writeback to base, if necessary. + if (Opcode == ARM::LDM_UPD || Opcode == ARM::STM_UPD) { + MI.addOperand(MCOperand::CreateReg(Base)); + ++OpIdx; + } + + MI.addOperand(MCOperand::CreateReg(Base)); + + ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn)); + MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode))); + + // Handling the two predicate operands before the reglist. + int64_t CondVal = insn >> ARMII::CondShift; + MI.addOperand(MCOperand::CreateImm(CondVal == 0xF ? 
0xE : CondVal)); + MI.addOperand(MCOperand::CreateReg(ARM::CPSR)); + + OpIdx += 4; + + // Fill the variadic part of reglist. + unsigned RegListBits = insn & ((1 << 16) - 1); + for (unsigned i = 0; i < 16; ++i) { + if ((RegListBits >> i) & 1) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + i))); + ++OpIdx; + } + } + + return true; +} + +// LDREX, LDREXB, LDREXH: Rd Rn +// LDREXD: Rd Rd+1 Rn +// STREX, STREXB, STREXH: Rd Rm Rn +// STREXD: Rd Rm Rm+1 Rn +// +// SWP, SWPB: Rd Rm Rn +static bool DisassembleLdStExFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 2 + && OpInfo[0].RegClass == ARM::GPRRegClassID + && OpInfo[1].RegClass == ARM::GPRRegClassID + && "Expect 2 reg operands"); + + bool isStore = slice(insn, 20, 20) == 0; + bool isDW = (Opcode == ARM::LDREXD || Opcode == ARM::STREXD); + + // Add the destination operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + + // Store register Exclusive needs a source operand. + if (isStore) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + ++OpIdx; + + if (isDW) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)+1))); + ++OpIdx; + } + } else if (isDW) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)+1))); + ++OpIdx; + } + + // Finally add the pointer operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + + return true; +} + +// Misc. Arithmetic Instructions. +// CLZ: Rd Rm +// PKHBT, PKHTB: Rd Rn Rm , LSL/ASR #imm5 +// RBIT, REV, REV16, REVSH: Rd Rm +static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 2 + && OpInfo[0].RegClass == ARM::GPRRegClassID + && OpInfo[1].RegClass == ARM::GPRRegClassID + && "Expect 2 reg operands"); + + bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID; + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + + if (ThreeReg) { + assert(NumOps >= 4 && "Expect >= 4 operands"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + } + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + ++OpIdx; + + // If there is still an operand info left which is an immediate operand, add + // an additional imm5 LSL/ASR operand. + if (ThreeReg && OpInfo[OpIdx].RegClass == 0 + && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { + // Extract the 5-bit immediate field Inst{11-7}. + unsigned ShiftAmt = (insn >> ARMII::ShiftShift) & 0x1F; + MI.addOperand(MCOperand::CreateImm(ShiftAmt)); + ++OpIdx; + } + + return true; +} + +// Extend instructions. +// SXT* and UXT*: Rd [Rn] Rm [rot_imm]. +// The 2nd operand register is Rn and the 3rd operand regsiter is Rm for the +// three register operand form. Otherwise, Rn=0b1111 and only Rm is used. 
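+// For example, SXTAB (three-register form) sign-extends Rm and adds it to Rn,
+// while SXTB is the Rn=0b1111 two-register form; in either case Inst{11-10}
+// selects a rotation of the source by 0, 8, 16, or 24 bits before extension.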
+static bool DisassembleExtFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 2 + && OpInfo[0].RegClass == ARM::GPRRegClassID + && OpInfo[1].RegClass == ARM::GPRRegClassID + && "Expect 2 reg operands"); + + bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID; + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + + if (ThreeReg) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + } + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + ++OpIdx; + + // If there is still an operand info left which is an immediate operand, add + // an additional rotate immediate operand. + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { + // Extract the 2-bit rotate field Inst{11-10}. + unsigned rot = (insn >> ARMII::ExtRotImmShift) & 3; + // Rotation by 8, 16, or 24 bits. + MI.addOperand(MCOperand::CreateImm(rot << 3)); + ++OpIdx; + } + + return true; +} + +///////////////////////////////////// +// // +// Utility Functions For VFP // +// // +///////////////////////////////////// + +// Extract/Decode Dd/Sd: +// +// SP => d = UInt(Vd:D) +// DP => d = UInt(D:Vd) +static unsigned decodeVFPRd(uint32_t insn, bool isSPVFP) { + return isSPVFP ? (decodeRd(insn) << 1 | getDBit(insn)) + : (decodeRd(insn) | getDBit(insn) << 4); +} + +// Extract/Decode Dn/Sn: +// +// SP => n = UInt(Vn:N) +// DP => n = UInt(N:Vn) +static unsigned decodeVFPRn(uint32_t insn, bool isSPVFP) { + return isSPVFP ? (decodeRn(insn) << 1 | getNBit(insn)) + : (decodeRn(insn) | getNBit(insn) << 4); +} + +// Extract/Decode Dm/Sm: +// +// SP => m = UInt(Vm:M) +// DP => m = UInt(M:Vm) +static unsigned decodeVFPRm(uint32_t insn, bool isSPVFP) { + return isSPVFP ? (decodeRm(insn) << 1 | getMBit(insn)) + : (decodeRm(insn) | getMBit(insn) << 4); +} + +// A7.5.1 +#if 0 +static uint64_t VFPExpandImm(unsigned char byte, unsigned N) { + assert(N == 32 || N == 64); + + uint64_t Result; + unsigned bit6 = slice(byte, 6, 6); + if (N == 32) { + Result = slice(byte, 7, 7) << 31 | slice(byte, 5, 0) << 19; + if (bit6) + Result |= 0x1f << 25; + else + Result |= 0x1 << 30; + } else { + Result = (uint64_t)slice(byte, 7, 7) << 63 | + (uint64_t)slice(byte, 5, 0) << 48; + if (bit6) + Result |= 0xffL << 54; + else + Result |= 0x1L << 62; + } + return Result; +} +#endif + +// VFP Unary Format Instructions: +// +// VCMP[E]ZD, VCMP[E]ZS: compares one floating-point register with zero +// VCVTDS, VCVTSD: converts between double-precision and single-precision +// The rest of the instructions have homogeneous [VFP]Rd and [VFP]Rm registers. 
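+// As a worked example of the register decoding used below: with D = 1 and
+// Vd = 0b0101, a single-precision operand is S11 (Vd:D = 0b01011), while a
+// double-precision operand is D21 (D:Vd = 0b10101).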
+static bool DisassembleVFPUnaryFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 1 && "VFPUnaryFrm expects NumOps >= 1"); + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + unsigned RegClass = OpInfo[OpIdx].RegClass; + assert((RegClass == ARM::SPRRegClassID || RegClass == ARM::DPRRegClassID) && + "Reg operand expected"); + bool isSP = (RegClass == ARM::SPRRegClassID); + + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, RegClass, decodeVFPRd(insn, isSP)))); + ++OpIdx; + + // Early return for compare with zero instructions. + if (Opcode == ARM::VCMPEZD || Opcode == ARM::VCMPEZS + || Opcode == ARM::VCMPZD || Opcode == ARM::VCMPZS) + return true; + + RegClass = OpInfo[OpIdx].RegClass; + assert((RegClass == ARM::SPRRegClassID || RegClass == ARM::DPRRegClassID) && + "Reg operand expected"); + isSP = (RegClass == ARM::SPRRegClassID); + + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, RegClass, decodeVFPRm(insn, isSP)))); + ++OpIdx; + + return true; +} + +// All the instructions have homogeneous [VFP]Rd, [VFP]Rn, and [VFP]Rm regs. +// Some of them have operand constraints which tie the first operand in the +// InOperandList to that of the dst. As far as asm printing is concerned, this +// tied_to operand is simply skipped. +static bool DisassembleVFPBinaryFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 3 && "VFPBinaryFrm expects NumOps >= 3"); + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + unsigned RegClass = OpInfo[OpIdx].RegClass; + assert((RegClass == ARM::SPRRegClassID || RegClass == ARM::DPRRegClassID) && + "Reg operand expected"); + bool isSP = (RegClass == ARM::SPRRegClassID); + + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, RegClass, decodeVFPRd(insn, isSP)))); + ++OpIdx; + + // Skip tied_to operand constraint. + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) { + assert(NumOps >= 4 && "Expect >=4 operands"); + MI.addOperand(MCOperand::CreateReg(0)); + ++OpIdx; + } + + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, RegClass, decodeVFPRn(insn, isSP)))); + ++OpIdx; + + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, RegClass, decodeVFPRm(insn, isSP)))); + ++OpIdx; + + return true; +} + +// A8.6.295 vcvt (floating-point <-> integer) +// Int to FP: VSITOD, VSITOS, VUITOD, VUITOS +// FP to Int: VTOSI[Z|R]D, VTOSI[Z|R]S, VTOUI[Z|R]D, VTOUI[Z|R]S +// +// A8.6.297 vcvt (floating-point and fixed-point) +// Dd|Sd Dd|Sd(TIED_TO) #fbits(= 16|32 - UInt(imm4:i)) +static bool DisassembleVFPConv1Frm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 2 && "VFPConv1Frm expects NumOps >= 2"); + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + if (!OpInfo) return false; + + bool SP = slice(insn, 8, 8) == 0; // A8.6.295 & A8.6.297 + bool fixed_point = slice(insn, 17, 17) == 1; // A8.6.297 + unsigned RegClassID = SP ? ARM::SPRRegClassID : ARM::DPRRegClassID; + + if (fixed_point) { + // A8.6.297 + assert(NumOps >= 3 && "Expect >= 3 operands"); + int size = slice(insn, 7, 7) == 0 ? 
16 : 32; + int fbits = size - (slice(insn,3,0) << 1 | slice(insn,5,5)); + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, RegClassID, + decodeVFPRd(insn, SP)))); + + assert(TID.getOperandConstraint(1, TOI::TIED_TO) != -1 && + "Tied to operand expected"); + MI.addOperand(MI.getOperand(0)); + + assert(OpInfo[2].RegClass == 0 && !OpInfo[2].isPredicate() && + !OpInfo[2].isOptionalDef() && "Imm operand expected"); + MI.addOperand(MCOperand::CreateImm(fbits)); + + NumOpsAdded = 3; + } else { + // A8.6.295 + // The Rd (destination) and Rm (source) bits have different interpretations + // depending on their single-precisonness. + unsigned d, m; + if (slice(insn, 18, 18) == 1) { // to_integer operation + d = decodeVFPRd(insn, true /* Is Single Precision */); + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::SPRRegClassID, d))); + m = decodeVFPRm(insn, SP); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, m))); + } else { + d = decodeVFPRd(insn, SP); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, d))); + m = decodeVFPRm(insn, true /* Is Single Precision */); + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::SPRRegClassID, m))); + } + NumOpsAdded = 2; + } + + return true; +} + +// VMOVRS - A8.6.330 +// Rt => Rd; Sn => UInt(Vn:N) +static bool DisassembleVFPConv2Frm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 2 && "VFPConv2Frm expects NumOps >= 2"); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID, + decodeVFPRn(insn, true)))); + NumOpsAdded = 2; + return true; +} + +// VMOVRRD - A8.6.332 +// Rt => Rd; Rt2 => Rn; Dm => UInt(M:Vm) +// +// VMOVRRS - A8.6.331 +// Rt => Rd; Rt2 => Rn; Sm => UInt(Vm:M); Sm1 = Sm+1 +static bool DisassembleVFPConv3Frm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 3 && "VFPConv3Frm expects NumOps >= 3"); + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + unsigned &OpIdx = NumOpsAdded; + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + OpIdx = 2; + + if (OpInfo[OpIdx].RegClass == ARM::SPRRegClassID) { + unsigned Sm = decodeVFPRm(insn, true); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID, + Sm))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID, + Sm+1))); + OpIdx += 2; + } else { + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::DPRRegClassID, + decodeVFPRm(insn, false)))); + ++OpIdx; + } + return true; +} + +// VMOVSR - A8.6.330 +// Rt => Rd; Sn => UInt(Vn:N) +static bool DisassembleVFPConv4Frm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 2 && "VFPConv4Frm expects NumOps >= 2"); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID, + decodeVFPRn(insn, true)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + NumOpsAdded = 2; + return true; +} + +// VMOVDRR - A8.6.332 +// Rt => Rd; Rt2 => Rn; Dm => UInt(M:Vm) +// +// VMOVRRS - A8.6.331 +// Rt => Rd; Rt2 => Rn; Sm => UInt(Vm:M); Sm1 = Sm+1 +static bool DisassembleVFPConv5Frm(MCInst &MI, unsigned Opcode, 
uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 3 && "VFPConv5Frm expects NumOps >= 3"); + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + if (OpInfo[OpIdx].RegClass == ARM::SPRRegClassID) { + unsigned Sm = decodeVFPRm(insn, true); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID, + Sm))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID, + Sm+1))); + OpIdx += 2; + } else { + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::DPRRegClassID, + decodeVFPRm(insn, false)))); + ++OpIdx; + } + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + OpIdx += 2; + return true; +} + +// VFP Load/Store Instructions. +// VLDRD, VLDRS, VSTRD, VSTRS +static bool DisassembleVFPLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 3 && "VFPLdStFrm expects NumOps >= 3"); + + bool isSPVFP = (Opcode == ARM::VLDRS || Opcode == ARM::VSTRS) ? true : false; + unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID; + + // Extract Dd/Sd for operand 0. + unsigned RegD = decodeVFPRd(insn, isSPVFP); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, RegD))); + + unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)); + MI.addOperand(MCOperand::CreateReg(Base)); + + // Next comes the AM5 Opcode. + ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + unsigned char Imm8 = insn & 0xFF; + MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(AddrOpcode, Imm8))); + + NumOpsAdded = 3; + + return true; +} + +// VFP Load/Store Multiple Instructions. +// This is similar to the algorithm for LDM/STM in that operand 0 (the base) and +// operand 1 (the AM5 mode imm) is followed by two predicate operands. It is +// followed by a reglist of either DPR(s) or SPR(s). +// +// VLDMD[_UPD], VLDMS[_UPD], VSTMD[_UPD], VSTMS[_UPD] +static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 5 && "VFPLdStMulFrm expects NumOps >= 5"); + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)); + + // Writeback to base, if necessary. + if (Opcode == ARM::VLDMD_UPD || Opcode == ARM::VLDMS_UPD || + Opcode == ARM::VSTMD_UPD || Opcode == ARM::VSTMS_UPD) { + MI.addOperand(MCOperand::CreateReg(Base)); + ++OpIdx; + } + + MI.addOperand(MCOperand::CreateReg(Base)); + + // Next comes the AM5 Opcode. + ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn)); + // Must be either "ia" or "db" submode. + if (SubMode != ARM_AM::ia && SubMode != ARM_AM::db) { + DEBUG(errs() << "Illegal addressing mode 5 sub-mode!\n"); + return false; + } + + unsigned char Imm8 = insn & 0xFF; + MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(SubMode, Imm8))); + + // Handling the two predicate operands before the reglist. + int64_t CondVal = insn >> ARMII::CondShift; + MI.addOperand(MCOperand::CreateImm(CondVal == 0xF ? 0xE : CondVal)); + MI.addOperand(MCOperand::CreateReg(ARM::CPSR)); + + OpIdx += 4; + + bool isSPVFP = (Opcode == ARM::VLDMS || Opcode == ARM::VLDMS_UPD || + Opcode == ARM::VSTMS || Opcode == ARM::VSTMS_UPD) ? 
true : false; + unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID; + + // Extract Dd/Sd. + unsigned RegD = decodeVFPRd(insn, isSPVFP); + + // Fill the variadic part of reglist. + unsigned Regs = isSPVFP ? Imm8 : Imm8/2; + for (unsigned i = 0; i < Regs; ++i) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, + RegD + i))); + ++OpIdx; + } + + return true; +} + +// Misc. VFP Instructions. +// FMSTAT (vmrs with Rt=0b1111, i.e., to apsr_nzcv and no register operand) +// FCONSTD (DPR and a VFPf64Imm operand) +// FCONSTS (SPR and a VFPf32Imm operand) +// VMRS/VMSR (GPR operand) +static bool DisassembleVFPMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + if (Opcode == ARM::FMSTAT) + return true; + + assert(NumOps >= 2 && "VFPMiscFrm expects >=2 operands"); + + unsigned RegEnum = 0; + switch (OpInfo[0].RegClass) { + case ARM::DPRRegClassID: + RegEnum = getRegisterEnum(B, ARM::DPRRegClassID, decodeVFPRd(insn, false)); + break; + case ARM::SPRRegClassID: + RegEnum = getRegisterEnum(B, ARM::SPRRegClassID, decodeVFPRd(insn, true)); + break; + case ARM::GPRRegClassID: + RegEnum = getRegisterEnum(B, ARM::GPRRegClassID, decodeRd(insn)); + break; + default: + assert(0 && "Invalid reg class id"); + return false; + } + + MI.addOperand(MCOperand::CreateReg(RegEnum)); + ++OpIdx; + + // Extract/decode the f64/f32 immediate. + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { + // The asm syntax specifies the before-expanded <imm>. + // Not VFPExpandImm(slice(insn,19,16) << 4 | slice(insn, 3, 0), + // Opcode == ARM::FCONSTD ? 64 : 32) + MI.addOperand(MCOperand::CreateImm(slice(insn,19,16)<<4 | slice(insn,3,0))); + ++OpIdx; + } + + return true; +} + +// DisassembleThumbFrm() is defined in ThumbDisassemblerCore.h file. +#include "ThumbDisassemblerCore.h" + +///////////////////////////////////////////////////// +// // +// Utility Functions For ARM Advanced SIMD // +// // +///////////////////////////////////////////////////// + +// The following NEON namings are based on A8.6.266 VABA, VABAL. Notice that +// A8.6.303 VDUP (ARM core register)'s D/Vd pair is the N/Vn pair of VABA/VABAL. + +// A7.3 Register encoding + +// Extract/Decode NEON D/Vd: +// +// Note that for quadword, Qd = UInt(D:Vd<3:1>) = Inst{22:15-13}, whereas for +// doubleword, Dd = UInt(D:Vd). We compensate for this difference by +// handling it in the getRegisterEnum() utility function. +// D = Inst{22}, Vd = Inst{15-12} +static unsigned decodeNEONRd(uint32_t insn) { + return ((insn >> ARMII::NEON_D_BitShift) & 1) << 4 + | ((insn >> ARMII::NEON_RegRdShift) & ARMII::NEONRegMask); +} + +// Extract/Decode NEON N/Vn: +// +// Note that for quadword, Qn = UInt(N:Vn<3:1>) = Inst{7:19-17}, whereas for +// doubleword, Dn = UInt(N:Vn). We compensate for this difference by +// handling it in the getRegisterEnum() utility function. +// N = Inst{7}, Vn = Inst{19-16} +static unsigned decodeNEONRn(uint32_t insn) { + return ((insn >> ARMII::NEON_N_BitShift) & 1) << 4 + | ((insn >> ARMII::NEON_RegRnShift) & ARMII::NEONRegMask); +} + +// Extract/Decode NEON M/Vm: +// +// Note that for quadword, Qm = UInt(M:Vm<3:1>) = Inst{5:3-1}, whereas for +// doubleword, Dm = UInt(M:Vm). We compensate for this difference by +// handling it in the getRegisterEnum() utility function. 
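+// As an illustrative (made-up) example of the M:Vm scheme: M = 1 and
+// Vm = 0b0110 give the raw value 0b10110 = 22, which getRegisterEnum()
+// maps to D22 for a doubleword operand, or to Q11 for a quadword operand
+// since Qm = UInt(M:Vm<3:1>).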
+// M = Inst{5}, Vm = Inst{3-0} +static unsigned decodeNEONRm(uint32_t insn) { + return ((insn >> ARMII::NEON_M_BitShift) & 1) << 4 + | ((insn >> ARMII::NEON_RegRmShift) & ARMII::NEONRegMask); +} + +namespace { +enum ElemSize { + ESizeNA = 0, + ESize8 = 8, + ESize16 = 16, + ESize32 = 32, + ESize64 = 64 +}; +} // End of unnamed namespace + +// size field -> Inst{11-10} +// index_align field -> Inst{7-4} +// +// The Lane Index interpretation depends on the Data Size: +// 8 (encoded as size = 0b00) -> Index = index_align[3:1] +// 16 (encoded as size = 0b01) -> Index = index_align[3:2] +// 32 (encoded as size = 0b10) -> Index = index_align[3] +// +// Ref: A8.6.317 VLD4 (single 4-element structure to one lane). +static unsigned decodeLaneIndex(uint32_t insn) { + unsigned size = insn >> 10 & 3; + assert((size == 0 || size == 1 || size == 2) && + "Encoding error: size should be either 0, 1, or 2"); + + unsigned index_align = insn >> 4 & 0xF; + return (index_align >> 1) >> size; +} + +// imm64 = AdvSIMDExpandImm(op, cmode, i:imm3:imm4) +// op = Inst{5}, cmode = Inst{11-8} +// i = Inst{24} (ARM architecture) +// imm3 = Inst{18-16}, imm4 = Inst{3-0} +// Ref: Table A7-15 Modified immediate values for Advanced SIMD instructions. +static uint64_t decodeN1VImm(uint32_t insn, ElemSize esize) { + unsigned char cmode = (insn >> 8) & 0xF; + unsigned char Imm8 = ((insn >> 24) & 1) << 7 | + ((insn >> 16) & 7) << 4 | + (insn & 0xF); + uint64_t Imm64 = 0; + + switch (esize) { + case ESize8: + Imm64 = Imm8; + break; + case ESize16: + Imm64 = Imm8 << 8*(cmode >> 1 & 1); + break; + case ESize32: { + if (cmode == 12) + Imm64 = (Imm8 << 8) | 0xFF; + else if (cmode == 13) + Imm64 = (Imm8 << 16) | 0xFFFF; + else { + // Imm8 to be shifted left by how many bytes... + Imm64 = Imm8 << 8*(cmode >> 1 & 3); + } + break; + } + case ESize64: { + for (unsigned i = 0; i < 8; ++i) + if ((Imm8 >> i) & 1) + Imm64 |= (uint64_t)0xFF << 8*i; + break; + } + default: + assert(0 && "Unreachable code!"); + return 0; + } + + return Imm64; +} + +// A8.6.339 VMUL, VMULL (by scalar) +// ESize16 => m = Inst{2-0} (Vm<2:0>) D0-D7 +// ESize32 => m = Inst{3-0} (Vm<3:0>) D0-D15 +static unsigned decodeRestrictedDm(uint32_t insn, ElemSize esize) { + switch (esize) { + case ESize16: + return insn & 7; + case ESize32: + return insn & 0xF; + default: + assert(0 && "Unreachable code!"); + return 0; + } +} + +// A8.6.339 VMUL, VMULL (by scalar) +// ESize16 => index = Inst{5:3} (M:Vm<3>) D0-D7 +// ESize32 => index = Inst{5} (M) D0-D15 +static unsigned decodeRestrictedDmIndex(uint32_t insn, ElemSize esize) { + switch (esize) { + case ESize16: + return (((insn >> 5) & 1) << 1) | ((insn >> 3) & 1); + case ESize32: + return (insn >> 5) & 1; + default: + assert(0 && "Unreachable code!"); + return 0; + } +} + +// A8.6.296 VCVT (between floating-point and fixed-point, Advanced SIMD) +// (64 - <fbits>) is encoded as imm6, i.e., Inst{21-16}. 
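+// Illustrative example (made-up encoding): imm6 = 0b110000 = 48 encodes
+// fbits = 64 - 48 = 16, so the instruction would print as, e.g.,
+// "vcvt.s32.f32 q0, q0, #16".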
+static unsigned decodeVCVTFractionBits(uint32_t insn) { + return 64 - ((insn >> 16) & 0x3F); +} + +// A8.6.302 VDUP (scalar) +// ESize8 => index = Inst{19-17} +// ESize16 => index = Inst{19-18} +// ESize32 => index = Inst{19} +static unsigned decodeNVLaneDupIndex(uint32_t insn, ElemSize esize) { + switch (esize) { + case ESize8: + return (insn >> 17) & 7; + case ESize16: + return (insn >> 18) & 3; + case ESize32: + return (insn >> 19) & 1; + default: + assert(0 && "Unspecified element size!"); + return 0; + } +} + +// A8.6.328 VMOV (ARM core register to scalar) +// A8.6.329 VMOV (scalar to ARM core register) +// ESize8 => index = Inst{21:6-5} +// ESize16 => index = Inst{21:6} +// ESize32 => index = Inst{21} +static unsigned decodeNVLaneOpIndex(uint32_t insn, ElemSize esize) { + switch (esize) { + case ESize8: + return ((insn >> 21) & 1) << 2 | ((insn >> 5) & 3); + case ESize16: + return ((insn >> 21) & 1) << 1 | ((insn >> 6) & 1); + case ESize32: + return ((insn >> 21) & 1); + default: + assert(0 && "Unspecified element size!"); + return 0; + } +} + +// Imm6 = Inst{21-16}, L = Inst{7} +// +// LeftShift == true (A8.6.367 VQSHL, A8.6.387 VSLI): +// case L:imm6 of +// '0001xxx' => esize = 8; shift_amount = imm6 - 8 +// '001xxxx' => esize = 16; shift_amount = imm6 - 16 +// '01xxxxx' => esize = 32; shift_amount = imm6 - 32 +// '1xxxxxx' => esize = 64; shift_amount = imm6 +// +// LeftShift == false (A8.6.376 VRSHR, A8.6.368 VQSHRN): +// case L:imm6 of +// '0001xxx' => esize = 8; shift_amount = 16 - imm6 +// '001xxxx' => esize = 16; shift_amount = 32 - imm6 +// '01xxxxx' => esize = 32; shift_amount = 64 - imm6 +// '1xxxxxx' => esize = 64; shift_amount = 64 - imm6 +// +static unsigned decodeNVSAmt(uint32_t insn, bool LeftShift) { + ElemSize esize = ESizeNA; + unsigned L = (insn >> 7) & 1; + unsigned imm6 = (insn >> 16) & 0x3F; + if (L == 0) { + if (imm6 >> 3 == 1) + esize = ESize8; + else if (imm6 >> 4 == 1) + esize = ESize16; + else if (imm6 >> 5 == 1) + esize = ESize32; + else + assert(0 && "Wrong encoding of Inst{7:21-16}!"); + } else + esize = ESize64; + + if (LeftShift) + return esize == ESize64 ? imm6 : (imm6 - esize); + else + return esize == ESize64 ? (esize - imm6) : (2*esize - imm6); +} + +// A8.6.305 VEXT +// Imm4 = Inst{11-8} +static unsigned decodeN3VImm(uint32_t insn) { + return (insn >> 8) & 0xF; +} + +static bool UseDRegPair(unsigned Opcode) { + switch (Opcode) { + default: + return false; + case ARM::VLD1q8_UPD: + case ARM::VLD1q16_UPD: + case ARM::VLD1q32_UPD: + case ARM::VLD1q64_UPD: + case ARM::VST1q8_UPD: + case ARM::VST1q16_UPD: + case ARM::VST1q32_UPD: + case ARM::VST1q64_UPD: + return true; + } +} + +// VLD* +// D[d] D[d2] ... Rn [TIED_TO Rn] align [Rm] +// VLD*LN* +// D[d] D[d2] ... Rn [TIED_TO Rn] align [Rm] TIED_TO ... imm(idx) +// VST* +// Rn [TIED_TO Rn] align [Rm] D[d] D[d2] ... +// VST*LN* +// Rn [TIED_TO Rn] align [Rm] D[d] D[d2] ... [imm(idx)] +// +// Correctly set VLD*/VST*'s TIED_TO GPR, as the asm printer needs it. +static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, bool Store, bool DblSpaced, + BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + + // At least one DPR register plus addressing mode #6. + assert(NumOps >= 3 && "Expect >= 3 operands"); + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + // We have homogeneous NEON registers for Load/Store. + unsigned RegClass = 0; + + // Double-spaced registers have increments of 2. 
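+  // For illustration (example only): a double-spaced "vld2.8 {d0, d2}, [r0]"
+  // steps the list by 2, so consecutive list entries are D0 and D2 rather
+  // than D0 and D1.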
+ unsigned Inc = DblSpaced ? 2 : 1; + + unsigned Rn = decodeRn(insn); + unsigned Rm = decodeRm(insn); + unsigned Rd = decodeNEONRd(insn); + + // A7.7.1 Advanced SIMD addressing mode. + bool WB = Rm != 15; + + // LLVM Addressing Mode #6. + unsigned RmEnum = 0; + if (WB && Rm != 13) + RmEnum = getRegisterEnum(B, ARM::GPRRegClassID, Rm); + + if (Store) { + // Consume possible WB, AddrMode6, possible increment reg, the DPR/QPR's, + // then possible lane index. + assert(OpIdx < NumOps && OpInfo[0].RegClass == ARM::GPRRegClassID && + "Reg operand expected"); + + if (WB) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + Rn))); + ++OpIdx; + } + + assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + OpInfo[OpIdx + 1].RegClass == 0 && "Addrmode #6 Operands expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + Rn))); + MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? + OpIdx += 2; + + if (WB) { + MI.addOperand(MCOperand::CreateReg(RmEnum)); + ++OpIdx; + } + + assert(OpIdx < NumOps && + (OpInfo[OpIdx].RegClass == ARM::DPRRegClassID || + OpInfo[OpIdx].RegClass == ARM::QPRRegClassID) && + "Reg operand expected"); + + RegClass = OpInfo[OpIdx].RegClass; + while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, RegClass, Rd, + UseDRegPair(Opcode)))); + Rd += Inc; + ++OpIdx; + } + + // Handle possible lane index. + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { + MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn))); + ++OpIdx; + } + + } else { + // Consume the DPR/QPR's, possible WB, AddrMode6, possible incrment reg, + // possible TIED_TO DPR/QPR's (ignored), then possible lane index. + RegClass = OpInfo[0].RegClass; + + while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, RegClass, Rd, + UseDRegPair(Opcode)))); + Rd += Inc; + ++OpIdx; + } + + if (WB) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + Rn))); + ++OpIdx; + } + + assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && + OpInfo[OpIdx + 1].RegClass == 0 && "Addrmode #6 Operands expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + Rn))); + MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? + OpIdx += 2; + + if (WB) { + MI.addOperand(MCOperand::CreateReg(RmEnum)); + ++OpIdx; + } + + while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + assert(TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1 && + "Tied to operand expected"); + MI.addOperand(MCOperand::CreateReg(0)); + ++OpIdx; + } + + // Handle possible lane index. + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { + MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn))); + ++OpIdx; + } + } + + return true; +} + +// A7.7 +// If L (Inst{21}) == 0, store instructions. +// Find out about double-spaced-ness of the Opcode and pass it on to +// DisassembleNLdSt0(). +static bool DisassembleNLdSt(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const StringRef Name = ARMInsts[Opcode].Name; + bool DblSpaced = false; + + if (Name.find("LN") != std::string::npos) { + // To one lane instructions. 
+ // See, for example, 8.6.317 VLD4 (single 4-element structure to one lane). + + // <size> == 16 && Inst{5} == 1 --> DblSpaced = true + if (Name.endswith("16") || Name.endswith("16_UPD")) + DblSpaced = slice(insn, 5, 5) == 1; + + // <size> == 32 && Inst{6} == 1 --> DblSpaced = true + if (Name.endswith("32") || Name.endswith("32_UPD")) + DblSpaced = slice(insn, 6, 6) == 1; + + } else { + // Multiple n-element structures with type encoded as Inst{11-8}. + // See, for example, A8.6.316 VLD4 (multiple 4-element structures). + + // n == 2 && type == 0b1001 -> DblSpaced = true + if (Name.startswith("VST2") || Name.startswith("VLD2")) + DblSpaced = slice(insn, 11, 8) == 9; + + // n == 3 && type == 0b0101 -> DblSpaced = true + if (Name.startswith("VST3") || Name.startswith("VLD3")) + DblSpaced = slice(insn, 11, 8) == 5; + + // n == 4 && type == 0b0001 -> DblSpaced = true + if (Name.startswith("VST4") || Name.startswith("VLD4")) + DblSpaced = slice(insn, 11, 8) == 1; + + } + return DisassembleNLdSt0(MI, Opcode, insn, NumOps, NumOpsAdded, + slice(insn, 21, 21) == 0, DblSpaced, B); +} + +// VMOV (immediate) +// Qd/Dd imm +static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + + assert(NumOps >= 2 && + (OpInfo[0].RegClass == ARM::DPRRegClassID || + OpInfo[0].RegClass == ARM::QPRRegClassID) && + (OpInfo[1].RegClass == 0) && + "Expect 1 reg operand followed by 1 imm operand"); + + // Qd/Dd = Inst{22:15-12} => NEON Rd + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[0].RegClass, + decodeNEONRd(insn)))); + + ElemSize esize = ESizeNA; + switch (Opcode) { + case ARM::VMOVv8i8: + case ARM::VMOVv16i8: + esize = ESize8; + break; + case ARM::VMOVv4i16: + case ARM::VMOVv8i16: + esize = ESize16; + break; + case ARM::VMOVv2i32: + case ARM::VMOVv4i32: + esize = ESize32; + break; + case ARM::VMOVv1i64: + case ARM::VMOVv2i64: + esize = ESize64; + break; + default: + assert(0 && "Unreachable code!"); + return false; + } + + // One register and a modified immediate value. + // Add the imm operand. + MI.addOperand(MCOperand::CreateImm(decodeN1VImm(insn, esize))); + + NumOpsAdded = 2; + return true; +} + +namespace { +enum N2VFlag { + N2V_None, + N2V_VectorDupLane, + N2V_VectorConvert_Between_Float_Fixed +}; +} // End of unnamed namespace + +// Vector Convert [between floating-point and fixed-point] +// Qd/Dd Qm/Dm [fbits] +// +// Vector Duplicate Lane (from scalar to all elements) Instructions. +// VDUPLN16d, VDUPLN16q, VDUPLN32d, VDUPLN32q, VDUPLN8d, VDUPLN8q: +// Qd/Dd Dm index +// +// Vector Move Long: +// Qd Dm +// +// Vector Move Narrow: +// Dd Qm +// +// Others +static bool DisassembleNVdVmOptImm(MCInst &MI, unsigned Opc, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, N2VFlag Flag, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opc]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + + assert(NumOps >= 2 && + (OpInfo[0].RegClass == ARM::DPRRegClassID || + OpInfo[0].RegClass == ARM::QPRRegClassID) && + (OpInfo[1].RegClass == ARM::DPRRegClassID || + OpInfo[1].RegClass == ARM::QPRRegClassID) && + "Expect >= 2 operands and first 2 as reg operands"); + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + ElemSize esize = ESizeNA; + if (Flag == N2V_VectorDupLane) { + // VDUPLN has its index embedded. Its size can be inferred from the Opcode. 
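+    // Illustrative example: for VDUPLN32q ("vdup.32 q0, d1[1]") the element
+    // size (32) comes from the opcode and the lane index (1) from Inst{19},
+    // decoded below via decodeNVLaneDupIndex().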
+ assert(Opc >= ARM::VDUPLN16d && Opc <= ARM::VDUPLN8q && + "Unexpected Opcode"); + esize = (Opc == ARM::VDUPLN8d || Opc == ARM::VDUPLN8q) ? ESize8 + : ((Opc == ARM::VDUPLN16d || Opc == ARM::VDUPLN16q) ? ESize16 + : ESize32); + } + + // Qd/Dd = Inst{22:15-12} => NEON Rd + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, + decodeNEONRd(insn)))); + ++OpIdx; + + // VPADAL... + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) { + // TIED_TO operand. + MI.addOperand(MCOperand::CreateReg(0)); + ++OpIdx; + } + + // Dm = Inst{5:3-0} => NEON Rm + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, + decodeNEONRm(insn)))); + ++OpIdx; + + // VZIP and others have two TIED_TO reg operands. + int Idx; + while (OpIdx < NumOps && + (Idx = TID.getOperandConstraint(OpIdx, TOI::TIED_TO)) != -1) { + // Add TIED_TO operand. + MI.addOperand(MI.getOperand(Idx)); + ++OpIdx; + } + + // Add the imm operand, if required. + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { + + unsigned imm = 0xFFFFFFFF; + + if (Flag == N2V_VectorDupLane) + imm = decodeNVLaneDupIndex(insn, esize); + if (Flag == N2V_VectorConvert_Between_Float_Fixed) + imm = decodeVCVTFractionBits(insn); + + assert(imm != 0xFFFFFFFF && "Internal error"); + MI.addOperand(MCOperand::CreateImm(imm)); + ++OpIdx; + } + + return true; +} + +static bool DisassembleN2RegFrm(MCInst &MI, unsigned Opc, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + return DisassembleNVdVmOptImm(MI, Opc, insn, NumOps, NumOpsAdded, + N2V_None, B); +} +static bool DisassembleNVCVTFrm(MCInst &MI, unsigned Opc, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + return DisassembleNVdVmOptImm(MI, Opc, insn, NumOps, NumOpsAdded, + N2V_VectorConvert_Between_Float_Fixed, B); +} +static bool DisassembleNVecDupLnFrm(MCInst &MI, unsigned Opc, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + return DisassembleNVdVmOptImm(MI, Opc, insn, NumOps, NumOpsAdded, + N2V_VectorDupLane, B); +} + +// Vector Shift [Accumulate] Instructions. +// Qd/Dd [Qd/Dd (TIED_TO)] Qm/Dm ShiftAmt +// +// Vector Shift Left Long (with maximum shift count) Instructions. +// VSHLLi16, VSHLLi32, VSHLLi8: Qd Dm imm (== size) +// +static bool DisassembleNVectorShift(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, bool LeftShift, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + + assert(NumOps >= 3 && + (OpInfo[0].RegClass == ARM::DPRRegClassID || + OpInfo[0].RegClass == ARM::QPRRegClassID) && + (OpInfo[1].RegClass == ARM::DPRRegClassID || + OpInfo[1].RegClass == ARM::QPRRegClassID) && + "Expect >= 3 operands and first 2 as reg operands"); + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + // Qd/Dd = Inst{22:15-12} => NEON Rd + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, + decodeNEONRd(insn)))); + ++OpIdx; + + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) { + // TIED_TO operand. 
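+    // Illustrative example: VSRA ("vsra.u32 q0, q1, #3") accumulates into the
+    // destination, so Qd is both a def and a use; a zero placeholder stands in
+    // for the tied source and is skipped by the asm printer.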
+ MI.addOperand(MCOperand::CreateReg(0)); + ++OpIdx; + } + + assert((OpInfo[OpIdx].RegClass == ARM::DPRRegClassID || + OpInfo[OpIdx].RegClass == ARM::QPRRegClassID) && + "Reg operand expected"); + + // Qm/Dm = Inst{5:3-0} => NEON Rm + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, + decodeNEONRm(insn)))); + ++OpIdx; + + assert(OpInfo[OpIdx].RegClass == 0 && "Imm operand expected"); + + // Add the imm operand. + + // VSHLL has maximum shift count as the imm, inferred from its size. + unsigned Imm; + switch (Opcode) { + default: + Imm = decodeNVSAmt(insn, LeftShift); + break; + case ARM::VSHLLi8: + Imm = 8; + break; + case ARM::VSHLLi16: + Imm = 16; + break; + case ARM::VSHLLi32: + Imm = 32; + break; + } + MI.addOperand(MCOperand::CreateImm(Imm)); + ++OpIdx; + + return true; +} + +// Left shift instructions. +static bool DisassembleN2RegVecShLFrm(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + return DisassembleNVectorShift(MI, Opcode, insn, NumOps, NumOpsAdded, true, + B); +} +// Right shift instructions have different shift amount interpretation. +static bool DisassembleN2RegVecShRFrm(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + return DisassembleNVectorShift(MI, Opcode, insn, NumOps, NumOpsAdded, false, + B); +} + +namespace { +enum N3VFlag { + N3V_None, + N3V_VectorExtract, + N3V_VectorShift, + N3V_Multiply_By_Scalar +}; +} // End of unnamed namespace + +// NEON Three Register Instructions with Optional Immediate Operand +// +// Vector Extract Instructions. +// Qd/Dd Qn/Dn Qm/Dm imm4 +// +// Vector Shift (Register) Instructions. +// Qd/Dd Qm/Dm Qn/Dn (notice the order of m, n) +// +// Vector Multiply [Accumulate/Subtract] [Long] By Scalar Instructions. +// Qd/Dd Qn/Dn RestrictedDm index +// +// Others +static bool DisassembleNVdVnVmOptImm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, N3VFlag Flag, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + + // No checking for OpInfo[2] because of MOVDneon/MOVQ with only two regs. + assert(NumOps >= 3 && + (OpInfo[0].RegClass == ARM::DPRRegClassID || + OpInfo[0].RegClass == ARM::QPRRegClassID) && + (OpInfo[1].RegClass == ARM::DPRRegClassID || + OpInfo[1].RegClass == ARM::QPRRegClassID) && + "Expect >= 3 operands and first 2 as reg operands"); + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + bool VdVnVm = Flag == N3V_VectorShift ? false : true; + bool IsImm4 = Flag == N3V_VectorExtract ? true : false; + bool IsDmRestricted = Flag == N3V_Multiply_By_Scalar ? true : false; + ElemSize esize = ESizeNA; + if (Flag == N3V_Multiply_By_Scalar) { + unsigned size = (insn >> 20) & 3; + if (size == 1) esize = ESize16; + if (size == 2) esize = ESize32; + assert (esize == ESize16 || esize == ESize32); + } + + // Qd/Dd = Inst{22:15-12} => NEON Rd + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, + decodeNEONRd(insn)))); + ++OpIdx; + + // VABA, VABAL, VBSLd, VBSLq, ... + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) { + // TIED_TO operand. + MI.addOperand(MCOperand::CreateReg(0)); + ++OpIdx; + } + + // Dn = Inst{7:19-16} => NEON Rn + // or + // Dm = Inst{5:3-0} => NEON Rm + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, OpInfo[OpIdx].RegClass, + VdVnVm ? 
decodeNEONRn(insn) + : decodeNEONRm(insn)))); + ++OpIdx; + + // Special case handling for VMOVDneon and VMOVQ because they are marked as + // N3RegFrm. + if (Opcode == ARM::VMOVDneon || Opcode == ARM::VMOVQ) + return true; + + // Dm = Inst{5:3-0} => NEON Rm + // or + // Dm is restricted to D0-D7 if size is 16, D0-D15 otherwise + // or + // Dn = Inst{7:19-16} => NEON Rn + unsigned m = VdVnVm ? (IsDmRestricted ? decodeRestrictedDm(insn, esize) + : decodeNEONRm(insn)) + : decodeNEONRn(insn); + + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, OpInfo[OpIdx].RegClass, m))); + ++OpIdx; + + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { + // Add the imm operand. + unsigned Imm = 0; + if (IsImm4) + Imm = decodeN3VImm(insn); + else if (IsDmRestricted) + Imm = decodeRestrictedDmIndex(insn, esize); + else { + assert(0 && "Internal error: unreachable code!"); + return false; + } + + MI.addOperand(MCOperand::CreateImm(Imm)); + ++OpIdx; + } + + return true; +} + +static bool DisassembleN3RegFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded, + N3V_None, B); +} +static bool DisassembleN3RegVecShFrm(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded, + N3V_VectorShift, B); +} +static bool DisassembleNVecExtractFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded, + N3V_VectorExtract, B); +} +static bool DisassembleNVecMulScalarFrm(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded, + N3V_Multiply_By_Scalar, B); +} + +// Vector Table Lookup +// +// VTBL1, VTBX1: Dd [Dd(TIED_TO)] Dn Dm +// VTBL2, VTBX2: Dd [Dd(TIED_TO)] Dn Dn+1 Dm +// VTBL3, VTBX3: Dd [Dd(TIED_TO)] Dn Dn+1 Dn+2 Dm +// VTBL4, VTBX4: Dd [Dd(TIED_TO)] Dn Dn+1 Dn+2 Dn+3 Dm +static bool DisassembleNVTBLFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + if (!OpInfo) return false; + + assert(NumOps >= 3 && + OpInfo[0].RegClass == ARM::DPRRegClassID && + OpInfo[1].RegClass == ARM::DPRRegClassID && + OpInfo[2].RegClass == ARM::DPRRegClassID && + "Expect >= 3 operands and first 3 as reg operands"); + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + unsigned Rn = decodeNEONRn(insn); + + // {Dn} encoded as len = 0b00 + // {Dn Dn+1} encoded as len = 0b01 + // {Dn Dn+1 Dn+2 } encoded as len = 0b10 + // {Dn Dn+1 Dn+2 Dn+3} encoded as len = 0b11 + unsigned Len = slice(insn, 9, 8) + 1; + + // Dd (the destination vector) + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID, + decodeNEONRd(insn)))); + ++OpIdx; + + // Process tied_to operand constraint. + int Idx; + if ((Idx = TID.getOperandConstraint(OpIdx, TOI::TIED_TO)) != -1) { + MI.addOperand(MI.getOperand(Idx)); + ++OpIdx; + } + + // Do the <list> now. 
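+  // Illustrative example: VTBL3 ("vtbl.8 d0, {d1, d2, d3}, d4") has a len
+  // field of 0b10, so this loop adds D1, D2 and D3; the index vector D4 is
+  // added afterwards.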
+ for (unsigned i = 0; i < Len; ++i) { + assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::DPRRegClassID && + "Reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID, + Rn + i))); + ++OpIdx; + } + + // Dm (the index vector) + assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::DPRRegClassID && + "Reg operand (index vector) expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID, + decodeNEONRm(insn)))); + ++OpIdx; + + return true; +} + +static bool DisassembleNEONFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO) { + assert(0 && "Unreachable code!"); + return false; +} + +// Vector Get Lane (move scalar to ARM core register) Instructions. +// VGETLNi32, VGETLNs16, VGETLNs8, VGETLNu16, VGETLNu8: Rt Dn index +static bool DisassembleNEONGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + if (!OpInfo) return false; + + assert(TID.getNumDefs() == 1 && NumOps >= 3 && + OpInfo[0].RegClass == ARM::GPRRegClassID && + OpInfo[1].RegClass == ARM::DPRRegClassID && + OpInfo[2].RegClass == 0 && + "Expect >= 3 operands with one dst operand"); + + ElemSize esize = + Opcode == ARM::VGETLNi32 ? ESize32 + : ((Opcode == ARM::VGETLNs16 || Opcode == ARM::VGETLNu16) ? ESize16 + : ESize32); + + // Rt = Inst{15-12} => ARM Rd + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + + // Dn = Inst{7:19-16} => NEON Rn + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID, + decodeNEONRn(insn)))); + + MI.addOperand(MCOperand::CreateImm(decodeNVLaneOpIndex(insn, esize))); + + NumOpsAdded = 3; + return true; +} + +// Vector Set Lane (move ARM core register to scalar) Instructions. +// VSETLNi16, VSETLNi32, VSETLNi8: Dd Dd (TIED_TO) Rt index +static bool DisassembleNEONSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + if (!OpInfo) return false; + + assert(TID.getNumDefs() == 1 && NumOps >= 3 && + OpInfo[0].RegClass == ARM::DPRRegClassID && + OpInfo[1].RegClass == ARM::DPRRegClassID && + TID.getOperandConstraint(1, TOI::TIED_TO) != -1 && + OpInfo[2].RegClass == ARM::GPRRegClassID && + OpInfo[3].RegClass == 0 && + "Expect >= 3 operands with one dst operand"); + + ElemSize esize = + Opcode == ARM::VSETLNi8 ? ESize8 + : (Opcode == ARM::VSETLNi16 ? ESize16 + : ESize32); + + // Dd = Inst{7:19-16} => NEON Rn + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID, + decodeNEONRn(insn)))); + + // TIED_TO operand. + MI.addOperand(MCOperand::CreateReg(0)); + + // Rt = Inst{15-12} => ARM Rd + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + + MI.addOperand(MCOperand::CreateImm(decodeNVLaneOpIndex(insn, esize))); + + NumOpsAdded = 4; + return true; +} + +// Vector Duplicate Instructions (from ARM core register to all elements). 
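+// Illustrative example: "vdup.32 q1, r2" broadcasts the value of r2 into all
+// four 32-bit lanes of q1.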
+// VDUP8d, VDUP16d, VDUP32d, VDUP8q, VDUP16q, VDUP32q: Qd/Dd Rt +static bool DisassembleNEONDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + + assert(NumOps >= 2 && + (OpInfo[0].RegClass == ARM::DPRRegClassID || + OpInfo[0].RegClass == ARM::QPRRegClassID) && + OpInfo[1].RegClass == ARM::GPRRegClassID && + "Expect >= 2 operands and first 2 as reg operand"); + + unsigned RegClass = OpInfo[0].RegClass; + + // Qd/Dd = Inst{7:19-16} => NEON Rn + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClass, + decodeNEONRn(insn)))); + + // Rt = Inst{15-12} => ARM Rd + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + + NumOpsAdded = 2; + return true; +} + +// A8.6.41 DMB +// A8.6.42 DSB +// A8.6.49 ISB +static inline bool MemBarrierInstr(uint32_t insn) { + unsigned op7_4 = slice(insn, 7, 4); + if (slice(insn, 31, 20) == 0xf57 && (op7_4 >= 4 && op7_4 <= 6)) + return true; + + return false; +} + +static inline bool PreLoadOpcode(unsigned Opcode) { + switch(Opcode) { + case ARM::PLDi: case ARM::PLDr: + case ARM::PLDWi: case ARM::PLDWr: + case ARM::PLIi: case ARM::PLIr: + return true; + default: + return false; + } +} + +static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + // Preload Data/Instruction requires either 2 or 4 operands. + // PLDi, PLDWi, PLIi: Rn [+/-]imm12 add = (U == '1') + // PLDr[a|m], PLDWr[a|m], PLIr[a|m]: Rn Rm addrmode2_opc + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + + if (Opcode == ARM::PLDi || Opcode == ARM::PLDWi || Opcode == ARM::PLIi) { + unsigned Imm12 = slice(insn, 11, 0); + bool Negative = getUBit(insn) == 0; + int Offset = Negative ? -1 - Imm12 : 1 * Imm12; + MI.addOperand(MCOperand::CreateImm(Offset)); + NumOpsAdded = 2; + } else { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + + ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + + // Inst{6-5} encodes the shift opcode. + ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5)); + // Inst{11-7} encodes the imm5 shift amount. + unsigned ShImm = slice(insn, 11, 7); + + // A8.4.1. Possible rrx or shift amount of 32... + getImmShiftSE(ShOp, ShImm); + MI.addOperand(MCOperand::CreateImm( + ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp))); + NumOpsAdded = 3; + } + + return true; +} + +static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + if (MemBarrierInstr(insn)) + return true; + + switch (Opcode) { + case ARM::CLREX: + case ARM::NOP: + case ARM::TRAP: + case ARM::YIELD: + case ARM::WFE: + case ARM::WFI: + case ARM::SEV: + case ARM::SETENDBE: + case ARM::SETENDLE: + return true; + default: + break; + } + + // CPS has a singleton $opt operand that contains the following information: + // opt{4-0} = mode from Inst{4-0} + // opt{5} = changemode from Inst{17} + // opt{8-6} = AIF from Inst{8-6} + // opt{10-9} = imod from Inst{19-18} with 0b10 as enable and 0b11 as disable + if (Opcode == ARM::CPS) { + unsigned Option = slice(insn, 4, 0) | slice(insn, 17, 17) << 5 | + slice(insn, 8, 6) << 6 | slice(insn, 19, 18) << 9; + MI.addOperand(MCOperand::CreateImm(Option)); + NumOpsAdded = 1; + return true; + } + + // DBG has its option specified in Inst{3-0}. 
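+  // Illustrative example: "dbg #15" encodes Inst{3-0} = 0b1111, which becomes
+  // the single immediate operand added here.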
+ if (Opcode == ARM::DBG) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0))); + NumOpsAdded = 1; + return true; + } + + // BKPT takes an imm32 val equal to ZeroExtend(Inst{19-8:3-0}). + if (Opcode == ARM::BKPT) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 8) << 4 | + slice(insn, 3, 0))); + NumOpsAdded = 1; + return true; + } + + if (PreLoadOpcode(Opcode)) + return DisassemblePreLoadFrm(MI, Opcode, insn, NumOps, NumOpsAdded, B); + + assert(0 && "Unexpected misc instruction!"); + return false; +} + +static bool DisassembleThumbMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO) { + + assert(0 && "Unexpected thumb misc. instruction!"); + return false; +} + +/// FuncPtrs - FuncPtrs maps ARMFormat to its corresponding DisassembleFP. +/// We divide the disassembly task into different categories, with each one +/// corresponding to a specific instruction encoding format. There could be +/// exceptions when handling a specific format, and that is why the Opcode is +/// also present in the function prototype. +static const DisassembleFP FuncPtrs[] = { + &DisassemblePseudo, + &DisassembleMulFrm, + &DisassembleBrFrm, + &DisassembleBrMiscFrm, + &DisassembleDPFrm, + &DisassembleDPSoRegFrm, + &DisassembleLdFrm, + &DisassembleStFrm, + &DisassembleLdMiscFrm, + &DisassembleStMiscFrm, + &DisassembleLdStMulFrm, + &DisassembleLdStExFrm, + &DisassembleArithMiscFrm, + &DisassembleExtFrm, + &DisassembleVFPUnaryFrm, + &DisassembleVFPBinaryFrm, + &DisassembleVFPConv1Frm, + &DisassembleVFPConv2Frm, + &DisassembleVFPConv3Frm, + &DisassembleVFPConv4Frm, + &DisassembleVFPConv5Frm, + &DisassembleVFPLdStFrm, + &DisassembleVFPLdStMulFrm, + &DisassembleVFPMiscFrm, + &DisassembleThumbFrm, + &DisassembleNEONFrm, + &DisassembleNEONGetLnFrm, + &DisassembleNEONSetLnFrm, + &DisassembleNEONDupFrm, + &DisassembleMiscFrm, + &DisassembleThumbMiscFrm, + + // VLD and VST (including one lane) Instructions. + &DisassembleNLdSt, + + // A7.4.6 One register and a modified immediate value + // 1-Register Instructions with imm. + // LLVM only defines VMOVv instructions. + &DisassembleN1RegModImmFrm, + + // 2-Register Instructions with no imm. + &DisassembleN2RegFrm, + + // 2-Register Instructions with imm (vector convert float/fixed point). + &DisassembleNVCVTFrm, + + // 2-Register Instructions with imm (vector dup lane). + &DisassembleNVecDupLnFrm, + + // Vector Shift Left Instructions. + &DisassembleN2RegVecShLFrm, + + // Vector Shift Righ Instructions, which has different interpretation of the + // shift amount from the imm6 field. + &DisassembleN2RegVecShRFrm, + + // 3-Register Data-Processing Instructions. + &DisassembleN3RegFrm, + + // Vector Shift (Register) Instructions. + // D:Vd M:Vm N:Vn (notice that M:Vm is the first operand) + &DisassembleN3RegVecShFrm, + + // Vector Extract Instructions. + &DisassembleNVecExtractFrm, + + // Vector [Saturating Rounding Doubling] Multiply [Accumulate/Subtract] [Long] + // By Scalar Instructions. + &DisassembleNVecMulScalarFrm, + + // Vector Table Lookup uses byte indexes in a control vector to look up byte + // values in a table and generate a new vector. + &DisassembleNVTBLFrm, + + NULL +}; + +/// BuildIt - BuildIt performs the build step for this ARM Basic MC Builder. +/// The general idea is to set the Opcode for the MCInst, followed by adding +/// the appropriate MCOperands to the MCInst. 
ARM Basic MC Builder delegates +/// to the Format-specific disassemble function for disassembly, followed by +/// TryPredicateAndSBitModifier() to do PredicateOperand and OptionalDefOperand +/// which follow the Dst/Src Operands. +bool ARMBasicMCBuilder::BuildIt(MCInst &MI, uint32_t insn) { + // Stage 1 sets the Opcode. + MI.setOpcode(Opcode); + // If the number of operands is zero, we're done! + if (NumOps == 0) + return true; + + // Stage 2 calls the format-specific disassemble function to build the operand + // list. + if (Disasm == NULL) + return false; + unsigned NumOpsAdded = 0; + bool OK = (*Disasm)(MI, Opcode, insn, NumOps, NumOpsAdded, this); + + if (!OK || this->Err != 0) return false; + if (NumOpsAdded >= NumOps) + return true; + + // Stage 3 deals with operands unaccounted for after stage 2 is finished. + // FIXME: Should this be done selectively? + return TryPredicateAndSBitModifier(MI, Opcode, insn, NumOps - NumOpsAdded); +} + +// A8.3 Conditional execution +// A8.3.1 Pseudocode details of conditional execution +// Condition bits '111x' indicate the instruction is always executed. +static uint32_t CondCode(uint32_t CondField) { + if (CondField == 0xF) + return ARMCC::AL; + return CondField; +} + +/// DoPredicateOperands - DoPredicateOperands process the predicate operands +/// of some Thumb instructions which come before the reglist operands. It +/// returns true if the two predicate operands have been processed. +bool ARMBasicMCBuilder::DoPredicateOperands(MCInst& MI, unsigned Opcode, + uint32_t /* insn */, unsigned short NumOpsRemaining) { + + assert(NumOpsRemaining > 0 && "Invalid argument"); + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + unsigned Idx = MI.getNumOperands(); + + // First, we check whether this instr specifies the PredicateOperand through + // a pair of TargetOperandInfos with isPredicate() property. + if (NumOpsRemaining >= 2 && + OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() && + OpInfo[Idx].RegClass == 0 && OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) + { + // If we are inside an IT block, get the IT condition bits maintained via + // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond(). + // See also A2.5.2. + if (InITBlock()) + MI.addOperand(MCOperand::CreateImm(GetITCond())); + else + MI.addOperand(MCOperand::CreateImm(ARMCC::AL)); + MI.addOperand(MCOperand::CreateReg(ARM::CPSR)); + return true; + } + + return false; +} + +/// TryPredicateAndSBitModifier - TryPredicateAndSBitModifier tries to process +/// the possible Predicate and SBitModifier, to build the remaining MCOperand +/// constituents. +bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode, + uint32_t insn, unsigned short NumOpsRemaining) { + + assert(NumOpsRemaining > 0 && "Invalid argument"); + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + const std::string &Name = ARMInsts[Opcode].Name; + unsigned Idx = MI.getNumOperands(); + + // First, we check whether this instr specifies the PredicateOperand through + // a pair of TargetOperandInfos with isPredicate() property. + if (NumOpsRemaining >= 2 && + OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() && + OpInfo[Idx].RegClass == 0 && OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) + { + // If we are inside an IT block, get the IT condition bits maintained via + // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond(). + // See also A2.5.2. 
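+    // Illustrative example: after "itte eq", the next three instructions are
+    // predicated EQ, EQ and NE; GetITCond() supplies those conditions here in
+    // place of the default AL.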
+ if (InITBlock()) + MI.addOperand(MCOperand::CreateImm(GetITCond())); + else { + if (Name.length() > 1 && Name[0] == 't') { + // Thumb conditional branch instructions have their cond field embedded, + // like ARM. + // + // A8.6.16 B + if (Name == "t2Bcc") + MI.addOperand(MCOperand::CreateImm(CondCode(slice(insn, 25, 22)))); + else if (Name == "tBcc") + MI.addOperand(MCOperand::CreateImm(CondCode(slice(insn, 11, 8)))); + else + MI.addOperand(MCOperand::CreateImm(ARMCC::AL)); + } else { + // ARM instructions get their condition field from Inst{31-28}. + MI.addOperand(MCOperand::CreateImm(CondCode(getCondField(insn)))); + } + } + MI.addOperand(MCOperand::CreateReg(ARM::CPSR)); + Idx += 2; + NumOpsRemaining -= 2; + } + + if (NumOpsRemaining == 0) + return true; + + // Next, if OptionalDefOperand exists, we check whether the 'S' bit is set. + if (OpInfo[Idx].isOptionalDef() && OpInfo[Idx].RegClass==ARM::CCRRegClassID) { + MI.addOperand(MCOperand::CreateReg(getSBit(insn) == 1 ? ARM::CPSR : 0)); + --NumOpsRemaining; + } + + if (NumOpsRemaining == 0) + return true; + else + return false; +} + +/// RunBuildAfterHook - RunBuildAfterHook performs operations deemed necessary +/// after BuildIt is finished. +bool ARMBasicMCBuilder::RunBuildAfterHook(bool Status, MCInst &MI, + uint32_t insn) { + + if (!SP) return Status; + + if (Opcode == ARM::t2IT) + Status = SP->InitIT(slice(insn, 7, 0)) ? Status : false; + else if (InITBlock()) + SP->UpdateIT(); + + return Status; +} + +/// Opcode, Format, and NumOperands make up an ARM Basic MCBuilder. +ARMBasicMCBuilder::ARMBasicMCBuilder(unsigned opc, ARMFormat format, + unsigned short num) + : Opcode(opc), Format(format), NumOps(num), SP(0), Err(0) { + unsigned Idx = (unsigned)format; + assert(Idx < (array_lengthof(FuncPtrs) - 1) && "Unknown format"); + Disasm = FuncPtrs[Idx]; +} + +/// CreateMCBuilder - Return an ARMBasicMCBuilder that can build up the MC +/// infrastructure of an MCInst given the Opcode and Format of the instr. +/// Return NULL if it fails to create/return a proper builder. API clients +/// are responsible for freeing up of the allocated memory. Cacheing can be +/// performed by the API clients to improve performance. +ARMBasicMCBuilder *llvm::CreateMCBuilder(unsigned Opcode, ARMFormat Format) { + // For "Unknown format", fail by returning a NULL pointer. + if ((unsigned)Format >= (array_lengthof(FuncPtrs) - 1)) { + DEBUG(errs() << "Unknown format\n"); + return 0; + } + + return new ARMBasicMCBuilder(Opcode, Format, + ARMInsts[Opcode].getNumOperands()); +} diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h new file mode 100644 index 0000000..b1d90df --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h @@ -0,0 +1,262 @@ +//===- ARMDisassemblerCore.h - ARM disassembler helpers ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the ARM Disassembler. +// +// The first part defines the enumeration type of ARM instruction format, which +// specifies the encoding used by the instruction, as well as a helper function +// to convert the enums to printable char strings. 
+// +// It also contains code to represent the concepts of Builder and DisassembleFP +// to solve the problem of disassembling an ARM instr. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMDISASSEMBLERCORE_H +#define ARMDISASSEMBLERCORE_H + +#include "llvm/MC/MCInst.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "ARMInstrInfo.h" +#include "ARMDisassembler.h" + +namespace llvm { + +class ARMUtils { +public: + static const char *OpcodeName(unsigned Opcode); +}; + +///////////////////////////////////////////////////// +// // +// Enums and Utilities for ARM Instruction Format // +// // +///////////////////////////////////////////////////// + +#define ARM_FORMATS \ + ENTRY(ARM_FORMAT_PSEUDO, 0) \ + ENTRY(ARM_FORMAT_MULFRM, 1) \ + ENTRY(ARM_FORMAT_BRFRM, 2) \ + ENTRY(ARM_FORMAT_BRMISCFRM, 3) \ + ENTRY(ARM_FORMAT_DPFRM, 4) \ + ENTRY(ARM_FORMAT_DPSOREGFRM, 5) \ + ENTRY(ARM_FORMAT_LDFRM, 6) \ + ENTRY(ARM_FORMAT_STFRM, 7) \ + ENTRY(ARM_FORMAT_LDMISCFRM, 8) \ + ENTRY(ARM_FORMAT_STMISCFRM, 9) \ + ENTRY(ARM_FORMAT_LDSTMULFRM, 10) \ + ENTRY(ARM_FORMAT_LDSTEXFRM, 11) \ + ENTRY(ARM_FORMAT_ARITHMISCFRM, 12) \ + ENTRY(ARM_FORMAT_EXTFRM, 13) \ + ENTRY(ARM_FORMAT_VFPUNARYFRM, 14) \ + ENTRY(ARM_FORMAT_VFPBINARYFRM, 15) \ + ENTRY(ARM_FORMAT_VFPCONV1FRM, 16) \ + ENTRY(ARM_FORMAT_VFPCONV2FRM, 17) \ + ENTRY(ARM_FORMAT_VFPCONV3FRM, 18) \ + ENTRY(ARM_FORMAT_VFPCONV4FRM, 19) \ + ENTRY(ARM_FORMAT_VFPCONV5FRM, 20) \ + ENTRY(ARM_FORMAT_VFPLDSTFRM, 21) \ + ENTRY(ARM_FORMAT_VFPLDSTMULFRM, 22) \ + ENTRY(ARM_FORMAT_VFPMISCFRM, 23) \ + ENTRY(ARM_FORMAT_THUMBFRM, 24) \ + ENTRY(ARM_FORMAT_NEONFRM, 25) \ + ENTRY(ARM_FORMAT_NEONGETLNFRM, 26) \ + ENTRY(ARM_FORMAT_NEONSETLNFRM, 27) \ + ENTRY(ARM_FORMAT_NEONDUPFRM, 28) \ + ENTRY(ARM_FORMAT_MISCFRM, 29) \ + ENTRY(ARM_FORMAT_THUMBMISCFRM, 30) \ + ENTRY(ARM_FORMAT_NLdSt, 31) \ + ENTRY(ARM_FORMAT_N1RegModImm, 32) \ + ENTRY(ARM_FORMAT_N2Reg, 33) \ + ENTRY(ARM_FORMAT_NVCVT, 34) \ + ENTRY(ARM_FORMAT_NVecDupLn, 35) \ + ENTRY(ARM_FORMAT_N2RegVecShL, 36) \ + ENTRY(ARM_FORMAT_N2RegVecShR, 37) \ + ENTRY(ARM_FORMAT_N3Reg, 38) \ + ENTRY(ARM_FORMAT_N3RegVecSh, 39) \ + ENTRY(ARM_FORMAT_NVecExtract, 40) \ + ENTRY(ARM_FORMAT_NVecMulScalar, 41) \ + ENTRY(ARM_FORMAT_NVTBL, 42) + +// ARM instruction format specifies the encoding used by the instruction. +#define ENTRY(n, v) n = v, +typedef enum { + ARM_FORMATS + ARM_FORMAT_NA +} ARMFormat; +#undef ENTRY + +// Converts enum to const char*. +static const inline char *stringForARMFormat(ARMFormat form) { +#define ENTRY(n, v) case n: return #n; + switch(form) { + ARM_FORMATS + case ARM_FORMAT_NA: + default: + return ""; + } +#undef ENTRY +} + +/// Expands on the enum definitions from ARMBaseInstrInfo.h. +/// They are being used by the disassembler implementation. +namespace ARMII { + enum { + NEONRegMask = 15, + GPRRegMask = 15, + NEON_RegRdShift = 12, + NEON_D_BitShift = 22, + NEON_RegRnShift = 16, + NEON_N_BitShift = 7, + NEON_RegRmShift = 0, + NEON_M_BitShift = 5 + }; +} + +/// Utility function for extracting [From, To] bits from a uint32_t. +static inline unsigned slice(uint32_t Bits, unsigned From, unsigned To) { + assert(From < 32 && To < 32 && From >= To); + return (Bits >> To) & ((1 << (From - To + 1)) - 1); +} + +/// Utility function for setting [From, To] bits to Val for a uint32_t. 
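+/// For example (illustrative values): slice(0xE0812003, 31, 28) == 0xE
+/// extracts the condition field of "add r2, r1, r3", and
+/// setSlice(Bits, 31, 28, 0x0) would overwrite that field with EQ.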
+static inline void setSlice(uint32_t &Bits, unsigned From, unsigned To, + uint32_t Val) { + assert(From < 32 && To < 32 && From >= To); + uint32_t Mask = ((1 << (From - To + 1)) - 1); + Bits &= ~(Mask << To); + Bits |= (Val & Mask) << To; +} + +/// Various utilities for checking the target specific flags. + +/// A unary data processing instruction doesn't have an Rn operand. +static inline bool isUnaryDP(unsigned TSFlags) { + return (TSFlags & ARMII::UnaryDP); +} + +/// This four-bit field describes the addressing mode used. +/// See also ARMBaseInstrInfo.h. +static inline unsigned getAddrMode(unsigned TSFlags) { + return (TSFlags & ARMII::AddrModeMask); +} + +/// {IndexModePre, IndexModePost} +/// Only valid for load and store ops. +/// See also ARMBaseInstrInfo.h. +static inline unsigned getIndexMode(unsigned TSFlags) { + return (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; +} + +/// Pre-/post-indexed operations define an extra $base_wb in the OutOperandList. +static inline bool isPrePostLdSt(unsigned TSFlags) { + return (TSFlags & ARMII::IndexModeMask) != 0; +} + +// Forward declaration. +class ARMBasicMCBuilder; + +// Builder Object is mostly ignored except in some Thumb disassemble functions. +typedef ARMBasicMCBuilder *BO; + +/// DisassembleFP - DisassembleFP points to a function that disassembles an insn +/// and builds the MCOperand list upon disassembly. It returns false on failure +/// or true on success. The number of operands added is updated upon success. +typedef bool (*DisassembleFP)(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO Builder); + +/// CreateMCBuilder - Return an ARMBasicMCBuilder that can build up the MC +/// infrastructure of an MCInst given the Opcode and Format of the instr. +/// Return NULL if it fails to create/return a proper builder. API clients +/// are responsible for freeing up of the allocated memory. Cacheing can be +/// performed by the API clients to improve performance. +extern ARMBasicMCBuilder *CreateMCBuilder(unsigned Opcode, ARMFormat Format); + +/// ARMBasicMCBuilder - ARMBasicMCBuilder represents an ARM MCInst builder that +/// knows how to build up the MCOperand list. +class ARMBasicMCBuilder { + friend ARMBasicMCBuilder *CreateMCBuilder(unsigned Opcode, ARMFormat Format); + unsigned Opcode; + ARMFormat Format; + unsigned short NumOps; + DisassembleFP Disasm; + Session *SP; + int Err; // !=0 if the builder encounters some error condition during build. + +private: + /// Opcode, Format, and NumOperands make up an ARM Basic MCBuilder. + ARMBasicMCBuilder(unsigned opc, ARMFormat format, unsigned short num); + +public: + ARMBasicMCBuilder(ARMBasicMCBuilder &B) + : Opcode(B.Opcode), Format(B.Format), NumOps(B.NumOps), Disasm(B.Disasm), + SP(B.SP) { + Err = 0; + } + + virtual ~ARMBasicMCBuilder() {} + + void SetSession(Session *sp) { + SP = sp; + } + + void SetErr(int ErrCode) { + Err = ErrCode; + } + + /// DoPredicateOperands - DoPredicateOperands process the predicate operands + /// of some Thumb instructions which come before the reglist operands. It + /// returns true if the two predicate operands have been processed. + bool DoPredicateOperands(MCInst& MI, unsigned Opcode, + uint32_t insn, unsigned short NumOpsRemaning); + + /// TryPredicateAndSBitModifier - TryPredicateAndSBitModifier tries to process + /// the possible Predicate and SBitModifier, to build the remaining MCOperand + /// constituents. 
+ bool TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode, + uint32_t insn, unsigned short NumOpsRemaning); + + /// InITBlock - InITBlock returns true if we are inside an IT block. + bool InITBlock() { + if (SP) + return SP->ITCounter > 0; + + return false; + } + + /// Build - Build delegates to BuildIt to perform the heavy liftling. After + /// that, it invokes RunBuildAfterHook where some housekeepings can be done. + virtual bool Build(MCInst &MI, uint32_t insn) { + bool Status = BuildIt(MI, insn); + return RunBuildAfterHook(Status, MI, insn); + } + + /// BuildIt - BuildIt performs the build step for this ARM Basic MC Builder. + /// The general idea is to set the Opcode for the MCInst, followed by adding + /// the appropriate MCOperands to the MCInst. ARM Basic MC Builder delegates + /// to the Format-specific disassemble function for disassembly, followed by + /// TryPredicateAndSBitModifier() for PredicateOperand and OptionalDefOperand + /// which follow the Dst/Src Operands. + virtual bool BuildIt(MCInst &MI, uint32_t insn); + + /// RunBuildAfterHook - RunBuildAfterHook performs operations deemed necessary + /// after BuildIt is finished. + virtual bool RunBuildAfterHook(bool Status, MCInst &MI, uint32_t insn); + +private: + /// Get condition of the current IT instruction. + unsigned GetITCond() { + assert(SP); + return slice(SP->ITState, 7, 4); + } +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/Makefile b/contrib/llvm/lib/Target/ARM/Disassembler/Makefile new file mode 100644 index 0000000..031b6ac --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Disassembler/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/ARM/Disassembler/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMARMDisassembler + +# Hack: we need to include 'main' arm target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h new file mode 100644 index 0000000..4b2e308 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h @@ -0,0 +1,2248 @@ +//===- ThumbDisassemblerCore.h - Thumb disassembler helpers -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the ARM Disassembler. +// It contains code for disassembling a Thumb instr. It is to be included by +// ARMDisassemblerCore.cpp because it contains the static DisassembleThumbFrm() +// function which acts as the dispatcher to disassemble a Thumb instruction. +// +//===----------------------------------------------------------------------===// + +/////////////////////////////// +// // +// Utility Functions // +// // +/////////////////////////////// + +// Utilities for 16-bit Thumb instructions. 
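+// Illustrative example: the Thumb1 encoding of "adds r1, r2, r3" is 0x18D1,
+// from which the extractors below give tRm = Inst{8-6} = 3, tRn = Inst{5-3} = 2
+// and tRd = Inst{2-0} = 1.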
+/* +15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + [ tRt ] + [ tRm ] [ tRn ] [ tRd ] + D [ Rm ] [ Rd ] + + [ imm3] + [ imm5 ] + i [ imm5 ] + [ imm7 ] + [ imm8 ] + [ imm11 ] + + [ cond ] +*/ + +// Extract tRt: Inst{10-8}. +static inline unsigned getT1tRt(uint32_t insn) { + return slice(insn, 10, 8); +} + +// Extract tRm: Inst{8-6}. +static inline unsigned getT1tRm(uint32_t insn) { + return slice(insn, 8, 6); +} + +// Extract tRn: Inst{5-3}. +static inline unsigned getT1tRn(uint32_t insn) { + return slice(insn, 5, 3); +} + +// Extract tRd: Inst{2-0}. +static inline unsigned getT1tRd(uint32_t insn) { + return slice(insn, 2, 0); +} + +// Extract [D:Rd]: Inst{7:2-0}. +static inline unsigned getT1Rd(uint32_t insn) { + return slice(insn, 7, 7) << 3 | slice(insn, 2, 0); +} + +// Extract Rm: Inst{6-3}. +static inline unsigned getT1Rm(uint32_t insn) { + return slice(insn, 6, 3); +} + +// Extract imm3: Inst{8-6}. +static inline unsigned getT1Imm3(uint32_t insn) { + return slice(insn, 8, 6); +} + +// Extract imm5: Inst{10-6}. +static inline unsigned getT1Imm5(uint32_t insn) { + return slice(insn, 10, 6); +} + +// Extract i:imm5: Inst{9:7-3}. +static inline unsigned getT1Imm6(uint32_t insn) { + return slice(insn, 9, 9) << 5 | slice(insn, 7, 3); +} + +// Extract imm7: Inst{6-0}. +static inline unsigned getT1Imm7(uint32_t insn) { + return slice(insn, 6, 0); +} + +// Extract imm8: Inst{7-0}. +static inline unsigned getT1Imm8(uint32_t insn) { + return slice(insn, 7, 0); +} + +// Extract imm11: Inst{10-0}. +static inline unsigned getT1Imm11(uint32_t insn) { + return slice(insn, 10, 0); +} + +// Extract cond: Inst{11-8}. +static inline unsigned getT1Cond(uint32_t insn) { + return slice(insn, 11, 8); +} + +static inline bool IsGPR(unsigned RegClass) { + return RegClass == ARM::GPRRegClassID; +} + +// Utilities for 32-bit Thumb instructions. + +// Extract imm4: Inst{19-16}. +static inline unsigned getImm4(uint32_t insn) { + return slice(insn, 19, 16); +} + +// Extract imm3: Inst{14-12}. +static inline unsigned getImm3(uint32_t insn) { + return slice(insn, 14, 12); +} + +// Extract imm8: Inst{7-0}. +static inline unsigned getImm8(uint32_t insn) { + return slice(insn, 7, 0); +} + +// A8.6.61 LDRB (immediate, Thumb) and friends +// +/-: Inst{9} +// imm8: Inst{7-0} +static inline int decodeImm8(uint32_t insn) { + int Offset = getImm8(insn); + return slice(insn, 9, 9) ? Offset : -Offset; +} + +// Extract imm12: Inst{11-0}. +static inline unsigned getImm12(uint32_t insn) { + return slice(insn, 11, 0); +} + +// A8.6.63 LDRB (literal) and friends +// +/-: Inst{23} +// imm12: Inst{11-0} +static inline int decodeImm12(uint32_t insn) { + int Offset = getImm12(insn); + return slice(insn, 23, 23) ? Offset : -Offset; +} + +// Extract imm2: Inst{7-6}. +static inline unsigned getImm2(uint32_t insn) { + return slice(insn, 7, 6); +} + +// For BFI, BFC, t2SBFX, and t2UBFX. +// Extract lsb: Inst{14-12:7-6}. +static inline unsigned getLsb(uint32_t insn) { + return getImm3(insn) << 2 | getImm2(insn); +} + +// For BFI and BFC. +// Extract msb: Inst{4-0}. +static inline unsigned getMsb(uint32_t insn) { + return slice(insn, 4, 0); +} + +// For t2SBFX and t2UBFX. +// Extract widthminus1: Inst{4-0}. +static inline unsigned getWidthMinus1(uint32_t insn) { + return slice(insn, 4, 0); +} + +// For t2ADDri12 and t2SUBri12. +// imm12 = i:imm3:imm8; +static inline unsigned getIImm3Imm8(uint32_t insn) { + return slice(insn, 26, 26) << 11 | getImm3(insn) << 8 | getImm8(insn); +} + +// For t2MOVi16 and t2MOVTi16. 
+// imm16 = imm4:i:imm3:imm8; +static inline unsigned getImm16(uint32_t insn) { + return getImm4(insn) << 12 | slice(insn, 26, 26) << 11 | + getImm3(insn) << 8 | getImm8(insn); +} + +// Inst{5-4} encodes the shift type. +static inline unsigned getShiftTypeBits(uint32_t insn) { + return slice(insn, 5, 4); +} + +// Inst{14-12}:Inst{7-6} encodes the imm5 shift amount. +static inline unsigned getShiftAmtBits(uint32_t insn) { + return getImm3(insn) << 2 | getImm2(insn); +} + +// A8.6.17 BFC +// Encoding T1 ARMv6T2, ARMv7 +// LLVM-specific encoding for #<lsb> and #<width> +static inline bool getBitfieldInvMask(uint32_t insn, uint32_t &mask) { + uint32_t lsb = getImm3(insn) << 2 | getImm2(insn); + uint32_t msb = getMsb(insn); + uint32_t Val = 0; + if (msb < lsb) { + DEBUG(errs() << "Encoding error: msb < lsb\n"); + return false; + } + for (uint32_t i = lsb; i <= msb; ++i) + Val |= (1 << i); + mask = ~Val; + return true; +} + +// A8.4 Shifts applied to a register +// A8.4.1 Constant shifts +// A8.4.3 Pseudocode details of instruction-specified shifts and rotates +// +// decodeImmShift() returns the shift amount and the the shift opcode. +// Note that, as of Jan-06-2010, LLVM does not support rrx shifted operands yet. +static inline unsigned decodeImmShift(unsigned bits2, unsigned imm5, + ARM_AM::ShiftOpc &ShOp) { + + assert(imm5 < 32 && "Invalid imm5 argument"); + switch (bits2) { + default: assert(0 && "No such value"); + case 0: + ShOp = ARM_AM::lsl; + return imm5; + case 1: + ShOp = ARM_AM::lsr; + return (imm5 == 0 ? 32 : imm5); + case 2: + ShOp = ARM_AM::asr; + return (imm5 == 0 ? 32 : imm5); + case 3: + ShOp = (imm5 == 0 ? ARM_AM::rrx : ARM_AM::ror); + return (imm5 == 0 ? 1 : imm5); + } +} + +// A6.3.2 Modified immediate constants in Thumb instructions +// +// ThumbExpandImm() returns the modified immediate constant given an imm12 for +// Thumb data-processing instructions with modified immediate. +// See also A6.3.1 Data-processing (modified immediate). +static inline unsigned ThumbExpandImm(unsigned imm12) { + assert(imm12 <= 0xFFF && "Invalid imm12 argument"); + + // If the leading two bits is 0b00, the modified immediate constant is + // obtained by splatting the low 8 bits into the first byte, every other byte, + // or every byte of a 32-bit value. + // + // Otherwise, a rotate right of '1':imm12<6:0> by the amount imm12<11:7> is + // performed. + + if (slice(imm12, 11, 10) == 0) { + unsigned short control = slice(imm12, 9, 8); + unsigned imm8 = slice(imm12, 7, 0); + switch (control) { + default: + assert(0 && "No such value"); + return 0; + case 0: + return imm8; + case 1: + return imm8 << 16 | imm8; + case 2: + return imm8 << 24 | imm8 << 8; + case 3: + return imm8 << 24 | imm8 << 16 | imm8 << 8 | imm8; + } + } else { + // A rotate is required. 
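+    // Worked example (illustrative): for imm12 = 0x8A5, Val = 0x80 | 0x25 = 0xA5
+    // and Amt = slice(0x8A5, 11, 7) = 17, so the expanded constant is
+    // rotr32(0xA5, 17) = 0x00528000. (The splat path above would turn, e.g.,
+    // imm12 = 0x1AB into 0x00AB00AB.)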
+ unsigned Val = 1 << 7 | slice(imm12, 6, 0); + unsigned Amt = slice(imm12, 11, 7); + return ARM_AM::rotr32(Val, Amt); + } +} + +static inline int decodeImm32_B_EncodingT3(uint32_t insn) { + bool S = slice(insn, 26, 26); + bool J1 = slice(insn, 13, 13); + bool J2 = slice(insn, 11, 11); + unsigned Imm21 = slice(insn, 21, 16) << 12 | slice(insn, 10, 0) << 1; + if (S) Imm21 |= 1 << 20; + if (J2) Imm21 |= 1 << 19; + if (J1) Imm21 |= 1 << 18; + + return SignExtend32<21>(Imm21); +} + +static inline int decodeImm32_B_EncodingT4(uint32_t insn) { + unsigned S = slice(insn, 26, 26); + bool I1 = slice(insn, 13, 13) == S; + bool I2 = slice(insn, 11, 11) == S; + unsigned Imm25 = slice(insn, 25, 16) << 12 | slice(insn, 10, 0) << 1; + if (S) Imm25 |= 1 << 24; + if (I1) Imm25 |= 1 << 23; + if (I2) Imm25 |= 1 << 22; + + return SignExtend32<25>(Imm25); +} + +static inline int decodeImm32_BL(uint32_t insn) { + unsigned S = slice(insn, 26, 26); + bool I1 = slice(insn, 13, 13) == S; + bool I2 = slice(insn, 11, 11) == S; + unsigned Imm25 = slice(insn, 25, 16) << 12 | slice(insn, 10, 0) << 1; + if (S) Imm25 |= 1 << 24; + if (I1) Imm25 |= 1 << 23; + if (I2) Imm25 |= 1 << 22; + + return SignExtend32<25>(Imm25); +} + +static inline int decodeImm32_BLX(uint32_t insn) { + unsigned S = slice(insn, 26, 26); + bool I1 = slice(insn, 13, 13) == S; + bool I2 = slice(insn, 11, 11) == S; + unsigned Imm25 = slice(insn, 25, 16) << 12 | slice(insn, 10, 1) << 2; + if (S) Imm25 |= 1 << 24; + if (I1) Imm25 |= 1 << 23; + if (I2) Imm25 |= 1 << 22; + + return SignExtend32<25>(Imm25); +} + +// See, for example, A8.6.221 SXTAB16. +static inline unsigned decodeRotate(uint32_t insn) { + unsigned rotate = slice(insn, 5, 4); + return rotate << 3; +} + +/////////////////////////////////////////////// +// // +// Thumb1 instruction disassembly functions. // +// // +/////////////////////////////////////////////// + +// See "Utilities for 16-bit Thumb instructions" for register naming convention. + +// A6.2.1 Shift (immediate), add, subtract, move, and compare +// +// shift immediate: tRd CPSR tRn imm5 +// add/sub register: tRd CPSR tRn tRm +// add/sub 3-bit immediate: tRd CPSR tRn imm3 +// add/sub 8-bit immediate: tRt CPSR tRt(TIED_TO) imm8 +// mov/cmp immediate: tRt [CPSR] imm8 (CPSR present for mov) +// +// Special case: +// tMOVSr: tRd tRn +static bool DisassembleThumb1General(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID + && "Invalid arguments"); + + bool Imm3 = (Opcode == ARM::tADDi3 || Opcode == ARM::tSUBi3); + + // Use Rt implies use imm8. + bool UseRt = (Opcode == ARM::tADDi8 || Opcode == ARM::tSUBi8 || + Opcode == ARM::tMOVi8 || Opcode == ARM::tCMPi8); + + // Add the destination operand. + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::tGPRRegClassID, + UseRt ? getT1tRt(insn) : getT1tRd(insn)))); + ++OpIdx; + + // Check whether the next operand to be added is a CCR Register. + if (OpInfo[OpIdx].RegClass == ARM::CCRRegClassID) { + assert(OpInfo[OpIdx].isOptionalDef() && "Optional def operand expected"); + MI.addOperand(MCOperand::CreateReg(B->InITBlock() ? 0 : ARM::CPSR)); + ++OpIdx; + } + + // Check whether the next operand to be added is a Thumb1 Register. 
+ assert(OpIdx < NumOps && "More operands expected"); + if (OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID) { + // For UseRt, the reg operand is tied to the first reg operand. + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::tGPRRegClassID, + UseRt ? getT1tRt(insn) : getT1tRn(insn)))); + ++OpIdx; + } + + // Special case for tMOVSr. + if (OpIdx == NumOps) + return true; + + // The next available operand is either a reg operand or an imm operand. + if (OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID) { + // Three register operand instructions. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRm(insn)))); + } else { + assert(OpInfo[OpIdx].RegClass == 0 && + !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() + && "Pure imm operand expected"); + MI.addOperand(MCOperand::CreateImm(UseRt ? getT1Imm8(insn) + : (Imm3 ? getT1Imm3(insn) + : getT1Imm5(insn)))); + } + ++OpIdx; + + return true; +} + +// A6.2.2 Data-processing +// +// tCMPr, tTST, tCMN: tRd tRn +// tMVN, tRSB: tRd CPSR tRn +// Others: tRd CPSR tRd(TIED_TO) tRn +static bool DisassembleThumb1DP(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && + (OpInfo[1].RegClass == ARM::CCRRegClassID + || OpInfo[1].RegClass == ARM::tGPRRegClassID) + && "Invalid arguments"); + + // Add the destination operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRd(insn)))); + ++OpIdx; + + // Check whether the next operand to be added is a CCR Register. + if (OpInfo[OpIdx].RegClass == ARM::CCRRegClassID) { + assert(OpInfo[OpIdx].isOptionalDef() && "Optional def operand expected"); + MI.addOperand(MCOperand::CreateReg(B->InITBlock() ? 0 : ARM::CPSR)); + ++OpIdx; + } + + // We have either { tRd(TIED_TO), tRn } or { tRn } remaining. + // Process the TIED_TO operand first. + + assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID + && "Thumb reg operand expected"); + int Idx; + if ((Idx = TID.getOperandConstraint(OpIdx, TOI::TIED_TO)) != -1) { + // The reg operand is tied to the first reg operand. + MI.addOperand(MI.getOperand(Idx)); + ++OpIdx; + } + + // Process possible next reg operand. + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID) { + // Add tRn operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRn(insn)))); + ++OpIdx; + } + + return true; +} + +// A6.2.3 Special data instructions and branch and exchange +// +// tADDhirr: Rd Rd(TIED_TO) Rm +// tCMPhir: Rd Rm +// tMOVr, tMOVgpr2gpr, tMOVgpr2tgpr, tMOVtgpr2gpr: Rd|tRd Rm|tRn +// tBX_RET: 0 operand +// tBX_RET_vararg: Rm +// tBLXr_r9: Rm +static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + // tBX_RET has 0 operand. + if (NumOps == 0) + return true; + + // BX/BLX has 1 reg operand: Rm. + if (NumOps == 1) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + getT1Rm(insn)))); + NumOpsAdded = 1; + return true; + } + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + // Add the destination operand. 
+ unsigned RegClass = OpInfo[OpIdx].RegClass; + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, RegClass, + IsGPR(RegClass) ? getT1Rd(insn) + : getT1tRd(insn)))); + ++OpIdx; + + // We have either { Rd(TIED_TO), Rm } or { Rm|tRn } remaining. + // Process the TIED_TO operand first. + + assert(OpIdx < NumOps && "More operands expected"); + int Idx; + if ((Idx = TID.getOperandConstraint(OpIdx, TOI::TIED_TO)) != -1) { + // The reg operand is tied to the first reg operand. + MI.addOperand(MI.getOperand(Idx)); + ++OpIdx; + } + + // The next reg operand is either Rm or tRn. + assert(OpIdx < NumOps && "More operands expected"); + RegClass = OpInfo[OpIdx].RegClass; + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, RegClass, + IsGPR(RegClass) ? getT1Rm(insn) + : getT1tRn(insn)))); + ++OpIdx; + + return true; +} + +// A8.6.59 LDR (literal) +// +// tLDRpci: tRt imm8*4 +static bool DisassembleThumb1LdPC(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && + (OpInfo[1].RegClass == 0 && + !OpInfo[1].isPredicate() && + !OpInfo[1].isOptionalDef()) + && "Invalid arguments"); + + // Add the destination operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRt(insn)))); + + // And the (imm8 << 2) operand. + MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn) << 2)); + + NumOpsAdded = 2; + + return true; +} + +// Thumb specific addressing modes (see ARMInstrThumb.td): +// +// t_addrmode_rr := reg + reg +// +// t_addrmode_s4 := reg + reg +// reg + imm5 * 4 +// +// t_addrmode_s2 := reg + reg +// reg + imm5 * 2 +// +// t_addrmode_s1 := reg + reg +// reg + imm5 +// +// t_addrmode_sp := sp + imm8 * 4 +// + +// A6.2.4 Load/store single data item +// +// Load/Store Register (reg|imm): tRd tRn imm5 tRm +// Load Register Signed Byte|Halfword: tRd tRn tRm +static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + // Table A6-5 16-bit Thumb Load/store instructions + // opA = 0b0101 for STR/LDR (register) and friends. + // Otherwise, we have STR/LDR (immediate) and friends. + bool Imm5 = (opA != 5); + + assert(NumOps >= 2 + && OpInfo[0].RegClass == ARM::tGPRRegClassID + && OpInfo[1].RegClass == ARM::tGPRRegClassID + && "Expect >= 2 operands and first two as thumb reg operands"); + + // Add the destination reg and the base reg. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRd(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRn(insn)))); + OpIdx = 2; + + // We have either { imm5, tRm } or { tRm } remaining. + // Process the imm5 first. Note that STR/LDR (register) should skip the imm5 + // offset operand for t_addrmode_s[1|2|4]. + + assert(OpIdx < NumOps && "More operands expected"); + + if (OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() && + !OpInfo[OpIdx].isOptionalDef()) { + + MI.addOperand(MCOperand::CreateImm(Imm5 ? getT1Imm5(insn) : 0)); + ++OpIdx; + } + + // The next reg operand is tRm, the offset. 
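+  // For the immediate forms (Imm5 == true) a zero register is added below as a
+  // placeholder offset register, matching the t_addrmode_s[1|2|4] operand
+  // layout; only the register forms carry a real tRm.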
+ assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID + && "Thumb reg operand expected"); + MI.addOperand(MCOperand::CreateReg( + Imm5 ? 0 + : getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRm(insn)))); + ++OpIdx; + + return true; +} + +// A6.2.4 Load/store single data item +// +// Load/Store Register SP relative: tRt ARM::SP imm8 +static bool DisassembleThumb1LdStSP(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert((Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) + && "Unexpected opcode"); + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + assert(NumOps >= 3 && + OpInfo[0].RegClass == ARM::tGPRRegClassID && + OpInfo[1].RegClass == ARM::GPRRegClassID && + (OpInfo[2].RegClass == 0 && + !OpInfo[2].isPredicate() && + !OpInfo[2].isOptionalDef()) + && "Invalid arguments"); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRt(insn)))); + MI.addOperand(MCOperand::CreateReg(ARM::SP)); + MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn))); + NumOpsAdded = 3; + return true; +} + +// Table A6-1 16-bit Thumb instruction encoding +// A8.6.10 ADR +// +// tADDrPCi: tRt imm8 +static bool DisassembleThumb1AddPCi(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(Opcode == ARM::tADDrPCi && "Unexpected opcode"); + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && + (OpInfo[1].RegClass == 0 && + !OpInfo[1].isPredicate() && + !OpInfo[1].isOptionalDef()) + && "Invalid arguments"); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRt(insn)))); + MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn))); + NumOpsAdded = 2; + return true; +} + +// Table A6-1 16-bit Thumb instruction encoding +// A8.6.8 ADD (SP plus immediate) +// +// tADDrSPi: tRt ARM::SP imm8 +static bool DisassembleThumb1AddSPi(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(Opcode == ARM::tADDrSPi && "Unexpected opcode"); + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + assert(NumOps >= 3 && + OpInfo[0].RegClass == ARM::tGPRRegClassID && + OpInfo[1].RegClass == ARM::GPRRegClassID && + (OpInfo[2].RegClass == 0 && + !OpInfo[2].isPredicate() && + !OpInfo[2].isOptionalDef()) + && "Invalid arguments"); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRt(insn)))); + MI.addOperand(MCOperand::CreateReg(ARM::SP)); + MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn))); + NumOpsAdded = 3; + return true; +} + +// tPUSH, tPOP: Pred-Imm Pred-CCR register_list +// +// where register_list = low registers + [lr] for PUSH or +// low registers + [pc] for POP +// +// "low registers" is specified by Inst{7-0} +// lr|pc is specified by Inst{8} +static bool DisassembleThumb1PushPop(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert((Opcode == ARM::tPUSH || Opcode == ARM::tPOP) && "Unexpected opcode"); + + unsigned &OpIdx = NumOpsAdded; + + // Handling the two predicate operands before the reglist. 
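+  // DoPredicateOperands() is expected to append the Pred-Imm and Pred-CCR
+  // operands listed in the operand layout above, which is why OpIdx advances
+  // by 2 on success. Illustrative example: for "push {r4, r5, lr}", Inst{8} = 1
+  // and Inst{7-0} = 0x30, so RegListBits below becomes (1 << 14) | 0x30 =
+  // 0x4030, i.e. r4, r5 and lr.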
+ if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) + OpIdx += 2; + else { + DEBUG(errs() << "Expected predicate operands not found.\n"); + return false; + } + + unsigned RegListBits = slice(insn, 8, 8) << (Opcode == ARM::tPUSH ? 14 : 15) + | slice(insn, 7, 0); + + // Fill the variadic part of reglist. + for (unsigned i = 0; i < 16; ++i) { + if ((RegListBits >> i) & 1) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + i))); + ++OpIdx; + } + } + + return true; +} + +// A6.2.5 Miscellaneous 16-bit instructions +// Delegate to DisassembleThumb1PushPop() for tPUSH & tPOP. +// +// tADDspi, tSUBspi: ARM::SP ARM::SP(TIED_TO) imm7 +// t2IT: firstcond=Inst{7-4} mask=Inst{3-0} +// tCBNZ, tCBZ: tRd imm6*2 +// tBKPT: imm8 +// tNOP, tSEV, tYIELD, tWFE, tWFI: +// no operand (except predicate pair) +// tSETENDBE, tSETENDLE, : +// no operand +// Others: tRd tRn +static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + if (NumOps == 0) + return true; + + if (Opcode == ARM::tPUSH || Opcode == ARM::tPOP) + return DisassembleThumb1PushPop(MI, Opcode, insn, NumOps, NumOpsAdded, B); + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + + // Predicate operands are handled elsewhere. + if (NumOps == 2 && + OpInfo[0].isPredicate() && OpInfo[1].isPredicate() && + OpInfo[0].RegClass == 0 && OpInfo[1].RegClass == ARM::CCRRegClassID) { + return true; + } + + if (Opcode == ARM::tADDspi || Opcode == ARM::tSUBspi) { + // Special case handling for tADDspi and tSUBspi. + // A8.6.8 ADD (SP plus immediate) & A8.6.215 SUB (SP minus immediate) + MI.addOperand(MCOperand::CreateReg(ARM::SP)); + MI.addOperand(MCOperand::CreateReg(ARM::SP)); + MI.addOperand(MCOperand::CreateImm(getT1Imm7(insn))); + NumOpsAdded = 3; + return true; + } + + if (Opcode == ARM::t2IT) { + // Special case handling for If-Then. + // A8.6.50 IT + // Tag the (firstcond[0] bit << 4) along with mask. + + // firstcond + MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 4))); + + // firstcond[0] and mask + MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); + NumOpsAdded = 2; + return true; + } + + if (Opcode == ARM::tBKPT) { + MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn))); // breakpoint value + NumOpsAdded = 1; + return true; + } + + // CPS has a singleton $opt operand that contains the following information: + // opt{4-0} = don't care + // opt{5} = 0 (false) + // opt{8-6} = AIF from Inst{2-0} + // opt{10-9} = 1:imod from Inst{4} with 0b10 as enable and 0b11 as disable + if (Opcode == ARM::tCPS) { + unsigned Option = slice(insn, 2, 0) << 6 | slice(insn, 4, 4) << 9 | 1 << 10; + MI.addOperand(MCOperand::CreateImm(Option)); + NumOpsAdded = 1; + return true; + } + + assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && + (OpInfo[1].RegClass==0 || OpInfo[1].RegClass==ARM::tGPRRegClassID) + && "Expect >=2 operands"); + + // Add the destination operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRd(insn)))); + + if (OpInfo[1].RegClass == ARM::tGPRRegClassID) { + // Two register instructions. 
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRn(insn)))); + } else { + // CBNZ, CBZ + assert((Opcode == ARM::tCBNZ || Opcode == ARM::tCBZ) &&"Unexpected opcode"); + MI.addOperand(MCOperand::CreateImm(getT1Imm6(insn) * 2)); + } + + NumOpsAdded = 2; + + return true; +} + +// A8.6.53 LDM / LDMIA +// A8.6.189 STM / STMIA +// +// tLDM_UPD/tSTM_UPD: tRt tRt AM4ModeImm Pred-Imm Pred-CCR register_list +// tLDM: tRt AM4ModeImm Pred-Imm Pred-CCR register_list +static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert((Opcode == ARM::tLDM || Opcode == ARM::tLDM_UPD || + Opcode == ARM::tSTM_UPD) && "Unexpected opcode"); + + unsigned &OpIdx = NumOpsAdded; + + unsigned tRt = getT1tRt(insn); + + OpIdx = 0; + + // WB register, if necessary. + if (Opcode == ARM::tLDM_UPD || Opcode == ARM::tSTM_UPD) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + tRt))); + ++OpIdx; + } + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + tRt))); + ++OpIdx; + + // A8.6.53 LDM / LDMIA / LDMFD - Encoding T1 + // A8.6.53 STM / STMIA / STMEA - Encoding T1 + MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))); + ++OpIdx; + + // Handling the two predicate operands before the reglist. + if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) + OpIdx += 2; + else { + DEBUG(errs() << "Expected predicate operands not found.\n"); + return false; + } + + unsigned RegListBits = slice(insn, 7, 0); + + // Fill the variadic part of reglist. + for (unsigned i = 0; i < 8; ++i) { + if ((RegListBits >> i) & 1) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + i))); + ++OpIdx; + } + } + + return true; +} + +static bool DisassembleThumb1LdMul(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + return DisassembleThumb1LdStMul(true, MI, Opcode, insn, NumOps, NumOpsAdded, + B); +} + +static bool DisassembleThumb1StMul(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + return DisassembleThumb1LdStMul(false, MI, Opcode, insn, NumOps, NumOpsAdded, + B); +} + +// A8.6.16 B Encoding T1 +// cond = Inst{11-8} & imm8 = Inst{7-0} +// imm32 = SignExtend(imm8:'0', 32) +// +// tBcc: offset Pred-Imm Pred-CCR +// tSVC: imm8 Pred-Imm Pred-CCR +// tTRAP: 0 operand (early return) +static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO) { + + if (Opcode == ARM::tTRAP) + return true; + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + assert(NumOps == 3 && OpInfo[0].RegClass == 0 && + OpInfo[1].isPredicate() && OpInfo[2].RegClass == ARM::CCRRegClassID + && "Exactly 3 operands expected"); + + unsigned Imm8 = getT1Imm8(insn); + MI.addOperand(MCOperand::CreateImm( + Opcode == ARM::tBcc ? SignExtend32<9>(Imm8 << 1) + 4 + : (int)Imm8)); + + // Predicate operands by ARMBasicMCBuilder::TryPredicateAndSBitModifier(). 
+ NumOpsAdded = 1; + + return true; +} + +// A8.6.16 B Encoding T2 +// imm11 = Inst{10-0} +// imm32 = SignExtend(imm11:'0', 32) +// +// tB: offset +static bool DisassembleThumb1Br(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + assert(NumOps == 1 && OpInfo[0].RegClass == 0 && "1 imm operand expected"); + + unsigned Imm11 = getT1Imm11(insn); + + // When executing a Thumb instruction, PC reads as the address of the current + // instruction plus 4. The assembler subtracts 4 from the difference between + // the branch instruction and the target address, disassembler has to add 4 to + // to compensate. + MI.addOperand(MCOperand::CreateImm(SignExtend32<12>(Imm11 << 1) + 4)); + + NumOpsAdded = 1; + + return true; + +} + +// See A6.2 16-bit Thumb instruction encoding for instruction classes +// corresponding to op. +// +// Table A6-1 16-bit Thumb instruction encoding (abridged) +// op Instruction or instruction class +// ------ -------------------------------------------------------------------- +// 00xxxx Shift (immediate), add, subtract, move, and compare on page A6-7 +// 010000 Data-processing on page A6-8 +// 010001 Special data instructions and branch and exchange on page A6-9 +// 01001x Load from Literal Pool, see LDR (literal) on page A8-122 +// 0101xx Load/store single data item on page A6-10 +// 011xxx +// 100xxx +// 10100x Generate PC-relative address, see ADR on page A8-32 +// 10101x Generate SP-relative address, see ADD (SP plus immediate) on page A8-28 +// 1011xx Miscellaneous 16-bit instructions on page A6-11 +// 11000x Store multiple registers, see STM / STMIA / STMEA on page A8-374 +// 11001x Load multiple registers, see LDM / LDMIA / LDMFD on page A8-110 a +// 1101xx Conditional branch, and Supervisor Call on page A6-13 +// 11100x Unconditional Branch, see B on page A8-44 +// +static bool DisassembleThumb1(uint16_t op, MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + unsigned op1 = slice(op, 5, 4); + unsigned op2 = slice(op, 3, 2); + unsigned op3 = slice(op, 1, 0); + unsigned opA = slice(op, 5, 2); + switch (op1) { + case 0: + // A6.2.1 Shift (immediate), add, subtract, move, and compare + return DisassembleThumb1General(MI, Opcode, insn, NumOps, NumOpsAdded, B); + case 1: + switch (op2) { + case 0: + switch (op3) { + case 0: + // A6.2.2 Data-processing + return DisassembleThumb1DP(MI, Opcode, insn, NumOps, NumOpsAdded, B); + case 1: + // A6.2.3 Special data instructions and branch and exchange + return DisassembleThumb1Special(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + default: + // A8.6.59 LDR (literal) + return DisassembleThumb1LdPC(MI, Opcode, insn, NumOps, NumOpsAdded, B); + } + break; + default: + // A6.2.4 Load/store single data item + return DisassembleThumb1LdSt(opA, MI, Opcode, insn, NumOps, NumOpsAdded, + B); + break; + } + break; + case 2: + switch (op2) { + case 0: + // A6.2.4 Load/store single data item + return DisassembleThumb1LdSt(opA, MI, Opcode, insn, NumOps, NumOpsAdded, + B); + case 1: + // A6.2.4 Load/store single data item + return DisassembleThumb1LdStSP(MI, Opcode, insn, NumOps, NumOpsAdded, B); + case 2: + if (op3 <= 1) { + // A8.6.10 ADR + return DisassembleThumb1AddPCi(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + } else { + // A8.6.8 ADD (SP plus immediate) + return DisassembleThumb1AddSPi(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + } + default: + 
// A6.2.5 Miscellaneous 16-bit instructions + return DisassembleThumb1Misc(MI, Opcode, insn, NumOps, NumOpsAdded, B); + } + break; + case 3: + switch (op2) { + case 0: + if (op3 <= 1) { + // A8.6.189 STM / STMIA / STMEA + return DisassembleThumb1StMul(MI, Opcode, insn, NumOps, NumOpsAdded, B); + } else { + // A8.6.53 LDM / LDMIA / LDMFD + return DisassembleThumb1LdMul(MI, Opcode, insn, NumOps, NumOpsAdded, B); + } + case 1: + // A6.2.6 Conditional branch, and Supervisor Call + return DisassembleThumb1CondBr(MI, Opcode, insn, NumOps, NumOpsAdded, B); + case 2: + // Unconditional Branch, see B on page A8-44 + return DisassembleThumb1Br(MI, Opcode, insn, NumOps, NumOpsAdded, B); + default: + assert(0 && "Unreachable code"); + break; + } + break; + default: + assert(0 && "Unreachable code"); + break; + } + + return false; +} + +/////////////////////////////////////////////// +// // +// Thumb2 instruction disassembly functions. // +// // +/////////////////////////////////////////////// + +/////////////////////////////////////////////////////////// +// // +// Note: the register naming follows the ARM convention! // +// // +/////////////////////////////////////////////////////////// + +static inline bool Thumb2SRSOpcode(unsigned Opcode) { + switch (Opcode) { + default: + return false; + case ARM::t2SRSDBW: case ARM::t2SRSDB: + case ARM::t2SRSIAW: case ARM::t2SRSIA: + return true; + } +} + +static inline bool Thumb2RFEOpcode(unsigned Opcode) { + switch (Opcode) { + default: + return false; + case ARM::t2RFEDBW: case ARM::t2RFEDB: + case ARM::t2RFEIAW: case ARM::t2RFEIA: + return true; + } +} + +// t2SRS[IA|DB]W/t2SRS[IA|DB]: mode_imm = Inst{4-0} +static bool DisassembleThumb2SRS(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); + NumOpsAdded = 1; + return true; +} + +// t2RFE[IA|DB]W/t2RFE[IA|DB]: Rn +static bool DisassembleThumb2RFE(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + NumOpsAdded = 1; + return true; +} + +static bool DisassembleThumb2LdStMul(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + if (Thumb2SRSOpcode(Opcode)) + return DisassembleThumb2SRS(MI, Opcode, insn, NumOps, NumOpsAdded); + + if (Thumb2RFEOpcode(Opcode)) + return DisassembleThumb2RFE(MI, Opcode, insn, NumOps, NumOpsAdded, B); + + assert((Opcode == ARM::t2LDM || Opcode == ARM::t2LDM_UPD || + Opcode == ARM::t2STM || Opcode == ARM::t2STM_UPD) + && "Unexpected opcode"); + assert(NumOps >= 5 && "Thumb2 LdStMul expects NumOps >= 5"); + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)); + + // Writeback to base. + if (Opcode == ARM::t2LDM_UPD || Opcode == ARM::t2STM_UPD) { + MI.addOperand(MCOperand::CreateReg(Base)); + ++OpIdx; + } + + MI.addOperand(MCOperand::CreateReg(Base)); + ++OpIdx; + + ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn)); + MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode))); + ++OpIdx; + + // Handling the two predicate operands before the reglist. + if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) + OpIdx += 2; + else { + DEBUG(errs() << "Expected predicate operands not found.\n"); + return false; + } + + unsigned RegListBits = insn & ((1 << 16) - 1); + + // Fill the variadic part of reglist. 
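+  // Illustrative example: RegListBits = 0x4006 selects {r1, r2, lr}.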
+ for (unsigned i = 0; i < 16; ++i) { + if ((RegListBits >> i) & 1) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + i))); + ++OpIdx; + } + } + + return true; +} + +// t2LDREX: Rd Rn +// t2LDREXD: Rd Rs Rn +// t2LDREXB, t2LDREXH: Rd Rn +// t2STREX: Rs Rd Rn +// t2STREXD: Rm Rd Rs Rn +// t2STREXB, t2STREXH: Rm Rd Rn +static bool DisassembleThumb2LdStEx(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 2 + && OpInfo[0].RegClass == ARM::GPRRegClassID + && OpInfo[1].RegClass == ARM::GPRRegClassID + && "Expect >=2 operands and first two as reg operands"); + + bool isStore = (ARM::t2STREX <= Opcode && Opcode <= ARM::t2STREXH); + bool isSW = (Opcode == ARM::t2LDREX || Opcode == ARM::t2STREX); + bool isDW = (Opcode == ARM::t2LDREXD || Opcode == ARM::t2STREXD); + + // Add the destination operand for store. + if (isStore) { + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::GPRRegClassID, + isSW ? decodeRs(insn) : decodeRm(insn)))); + ++OpIdx; + } + + // Source operand for store and destination operand for load. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + + // Thumb2 doubleword complication: with an extra source/destination operand. + if (isDW) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRs(insn)))); + ++OpIdx; + } + + // Finally add the pointer operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + + return true; +} + +// LLVM, as of Jan-05-2010, does not output <Rt2>, i.e., Rs, in the asm. +// Whereas the ARM Arch. Manual does not require that t2 = t+1 like in ARM ISA. +// +// t2LDRDi8: Rd Rs Rn imm8s4 (offset mode) +// t2LDRDpci: Rd Rs imm8s4 (Not decoded, prefer the generic t2LDRDi8 version) +// t2STRDi8: Rd Rs Rn imm8s4 (offset mode) +// +// Ditto for t2LDRD_PRE, t2LDRD_POST, t2STRD_PRE, t2STRD_POST, which are for +// disassembly only and do not have a tied_to writeback base register operand. +static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + assert(NumOps >= 4 + && OpInfo[0].RegClass == ARM::GPRRegClassID + && OpInfo[1].RegClass == ARM::GPRRegClassID + && OpInfo[2].RegClass == ARM::GPRRegClassID + && OpInfo[3].RegClass == 0 + && "Expect >= 4 operands and first 3 as reg operands"); + + // Add the <Rt> <Rt2> operands. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRs(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + + // Finally add (+/-)imm8*4, depending on the U bit. 
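+  // Illustrative example: imm8 = 0x10 with U = 0 yields an offset of -64 bytes.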
+ int Offset = getImm8(insn) * 4; + if (getUBit(insn) == 0) + Offset = -Offset; + MI.addOperand(MCOperand::CreateImm(Offset)); + NumOpsAdded = 4; + + return true; +} + +// PC-based defined for Codegen, which do not get decoded by design: +// +// t2TBB, t2TBH: Rm immDontCare immDontCare +// +// Generic version defined for disassembly: +// +// t2TBBgen, t2TBHgen: Rn Rm Pred-Imm Pred-CCR +static bool DisassembleThumb2TB(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + assert(NumOps >= 2 && "Expect >= 2 operands"); + + // The generic version of TBB/TBH needs a base register. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + // Add the index register. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + NumOpsAdded = 2; + + return true; +} + +static inline bool Thumb2ShiftOpcode(unsigned Opcode) { + switch (Opcode) { + default: + return false; + case ARM::t2MOVCClsl: case ARM::t2MOVCClsr: + case ARM::t2MOVCCasr: case ARM::t2MOVCCror: + case ARM::t2LSLri: case ARM::t2LSRri: + case ARM::t2ASRri: case ARM::t2RORri: + return true; + } +} + +// A6.3.11 Data-processing (shifted register) +// +// Two register operands (Rn=0b1111 no 1st operand reg): Rs Rm +// Two register operands (Rs=0b1111 no dst operand reg): Rn Rm +// Three register operands: Rs Rn Rm +// Three register operands: (Rn=0b1111 Conditional Move) Rs Ro(TIED_TO) Rm +// +// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2 +// register with shift forms: (Rm, ConstantShiftSpecifier). +// Constant shift specifier: Imm = (ShOp | ShAmt<<3). +// +// There are special instructions, like t2MOVsra_flag and t2MOVsrl_flag, which +// only require two register operands: Rd, Rm in ARM Reference Manual terms, and +// nothing else, because the shift amount is already specified. +// Similar case holds for t2MOVrx, t2ADDrr, ..., etc. +static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + // Special case handling. + if (Opcode == ARM::t2BR_JT) { + assert(NumOps == 4 + && OpInfo[0].RegClass == ARM::GPRRegClassID + && OpInfo[1].RegClass == ARM::GPRRegClassID + && OpInfo[2].RegClass == 0 + && OpInfo[3].RegClass == 0 + && "Exactlt 4 operands expect and first two as reg operands"); + // Only need to populate the src reg operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + MI.addOperand(MCOperand::CreateReg(0)); + MI.addOperand(MCOperand::CreateImm(0)); + MI.addOperand(MCOperand::CreateImm(0)); + NumOpsAdded = 4; + return true; + } + + OpIdx = 0; + + assert(NumOps >= 2 + && OpInfo[0].RegClass == ARM::GPRRegClassID + && OpInfo[1].RegClass == ARM::GPRRegClassID + && "Expect >= 2 operands and first two as reg operands"); + + bool ThreeReg = (NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID); + bool NoDstReg = (decodeRs(insn) == 0xF); + + // Build the register operands, followed by the constant shift specifier. + + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::GPRRegClassID, + NoDstReg ? decodeRn(insn) : decodeRs(insn)))); + ++OpIdx; + + if (ThreeReg) { + int Idx; + if ((Idx = TID.getOperandConstraint(OpIdx, TOI::TIED_TO)) != -1) { + // Process tied_to operand constraint. 
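+      // A TIED_TO constraint means this operand must equal an earlier operand
+      // (e.g. the conditional-move forms reuse the destination register), so
+      // the MCOperand already built at index Idx is copied rather than decoded
+      // from the instruction bits.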
+ MI.addOperand(MI.getOperand(Idx)); + ++OpIdx; + } else if (!NoDstReg) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + } else { + DEBUG(errs() << "Thumb2 encoding error: d==15 for three-reg operands.\n"); + return false; + } + } + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + ++OpIdx; + + if (NumOps == OpIdx) + return true; + + if (OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + && !OpInfo[OpIdx].isOptionalDef()) { + + if (Thumb2ShiftOpcode(Opcode)) + MI.addOperand(MCOperand::CreateImm(getShiftAmtBits(insn))); + else { + // Build the constant shift specifier operand. + unsigned bits2 = getShiftTypeBits(insn); + unsigned imm5 = getShiftAmtBits(insn); + ARM_AM::ShiftOpc ShOp = ARM_AM::no_shift; + unsigned ShAmt = decodeImmShift(bits2, imm5, ShOp); + + // PKHBT/PKHTB are special in that we need the decodeImmShift() call to + // decode the shift amount from raw imm5 and bits2, but we DO NOT need + // to encode the ShOp, as it's in the asm string already. + if (Opcode == ARM::t2PKHBT || Opcode == ARM::t2PKHTB) + MI.addOperand(MCOperand::CreateImm(ShAmt)); + else + MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, ShAmt))); + } + ++OpIdx; + } + + return true; +} + +// A6.3.1 Data-processing (modified immediate) +// +// Two register operands: Rs Rn ModImm +// One register operands (Rs=0b1111 no explicit dest reg): Rn ModImm +// One register operands (Rn=0b1111 no explicit src reg): Rs ModImm - {t2MOVi, t2MVNi} +// +// ModImm = ThumbExpandImm(i:imm3:imm8) +static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::GPRRegClassID + && "Expect >= 2 operands and first one as reg operand"); + + bool TwoReg = (OpInfo[1].RegClass == ARM::GPRRegClassID); + bool NoDstReg = (decodeRs(insn) == 0xF); + + // Build the register operands, followed by the modified immediate. + + MI.addOperand(MCOperand::CreateReg( + getRegisterEnum(B, ARM::GPRRegClassID, + NoDstReg ? decodeRn(insn) : decodeRs(insn)))); + ++OpIdx; + + if (TwoReg) { + if (NoDstReg) { + DEBUG(errs()<<"Thumb2 encoding error: d==15 for DPModImm 2-reg instr.\n"); + return false; + } + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + } + + // The modified immediate operand should come next. 
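+  // Illustrative example: i:imm3:imm8 = 0x1AB expands via ThumbExpandImm() to
+  // 0x00AB00AB (the "every other byte" splat case).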
+ assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 && + !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() + && "Pure imm operand expected"); + + // i:imm3:imm8 + // A6.3.2 Modified immediate constants in Thumb instructions + unsigned imm12 = getIImm3Imm8(insn); + MI.addOperand(MCOperand::CreateImm(ThumbExpandImm(imm12))); + ++OpIdx; + + return true; +} + +static inline bool Thumb2SaturateOpcode(unsigned Opcode) { + switch (Opcode) { + case ARM::t2SSATlsl: case ARM::t2SSATasr: case ARM::t2SSAT16: + case ARM::t2USATlsl: case ARM::t2USATasr: case ARM::t2USAT16: + return true; + default: + return false; + } +} + +static inline unsigned decodeThumb2SaturatePos(unsigned Opcode, uint32_t insn) { + switch (Opcode) { + case ARM::t2SSATlsl: + case ARM::t2SSATasr: + return slice(insn, 4, 0) + 1; + case ARM::t2SSAT16: + return slice(insn, 3, 0) + 1; + case ARM::t2USATlsl: + case ARM::t2USATasr: + return slice(insn, 4, 0); + case ARM::t2USAT16: + return slice(insn, 3, 0); + default: + assert(0 && "Unexpected opcode"); + return 0; + } +} + +// A6.3.3 Data-processing (plain binary immediate) +// +// o t2ADDri12, t2SUBri12: Rs Rn imm12 +// o t2LEApcrel (ADR): Rs imm12 +// o t2BFC (BFC): Rs Ro(TIED_TO) bf_inv_mask_imm +// o t2BFI (BFI) (Currently not defined in LLVM as of Jan-07-2010) +// o t2MOVi16: Rs imm16 +// o t2MOVTi16: Rs imm16 +// o t2SBFX (SBFX): Rs Rn lsb width +// o t2UBFX (UBFX): Rs Rn lsb width +// o t2BFI (BFI): Rs Rn lsb width +// +// [Signed|Unsigned] Saturate [16] +// +// o t2SSAT[lsl|asr], t2USAT[lsl|asr]: Rs sat_pos Rn shamt +// o t2SSAT16, t2USAT16: Rs sat_pos Rn +static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::GPRRegClassID + && "Expect >= 2 operands and first one as reg operand"); + + bool TwoReg = (OpInfo[1].RegClass == ARM::GPRRegClassID); + + // Build the register operand(s), followed by the immediate(s). + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRs(insn)))); + ++OpIdx; + + // t2SSAT/t2SSAT16/t2USAT/t2USAT16 has imm operand after Rd. + if (Thumb2SaturateOpcode(Opcode)) { + MI.addOperand(MCOperand::CreateImm(decodeThumb2SaturatePos(Opcode, insn))); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + + if (Opcode == ARM::t2SSAT16 || Opcode == ARM::t2USAT16) { + OpIdx += 2; + return true; + } + + // For SSAT operand reg (Rn) has been disassembled above. + // Now disassemble the shift amount. + + // Inst{14-12:7-6} encodes the imm5 shift amount. + unsigned ShAmt = slice(insn, 14, 12) << 2 | slice(insn, 7, 6); + + MI.addOperand(MCOperand::CreateImm(ShAmt)); + + OpIdx += 3; + return true; + } + + if (TwoReg) { + assert(NumOps >= 3 && "Expect >= 3 operands"); + int Idx; + if ((Idx = TID.getOperandConstraint(OpIdx, TOI::TIED_TO)) != -1) { + // Process tied_to operand constraint. + MI.addOperand(MI.getOperand(Idx)); + } else { + // Add src reg operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + } + ++OpIdx; + } + + assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + && !OpInfo[OpIdx].isOptionalDef() + && "Pure imm operand expected"); + + // Pre-increment OpIdx. 
+ ++OpIdx; + + if (Opcode == ARM::t2ADDri12 || Opcode == ARM::t2SUBri12 + || Opcode == ARM::t2LEApcrel) + MI.addOperand(MCOperand::CreateImm(getIImm3Imm8(insn))); + else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) + MI.addOperand(MCOperand::CreateImm(getImm16(insn))); + else if (Opcode == ARM::t2BFC) { + uint32_t mask = 0; + if (getBitfieldInvMask(insn, mask)) + MI.addOperand(MCOperand::CreateImm(mask)); + else + return false; + } else { + // Handle the case of: lsb width + assert((Opcode == ARM::t2SBFX || Opcode == ARM::t2UBFX || + Opcode == ARM::t2BFI) && "Unexpected opcode"); + MI.addOperand(MCOperand::CreateImm(getLsb(insn))); + if (Opcode == ARM::t2BFI) { + if (getMsb(insn) < getLsb(insn)) { + DEBUG(errs() << "Encoding error: msb < lsb\n"); + return false; + } + MI.addOperand(MCOperand::CreateImm(getMsb(insn) - getLsb(insn) + 1)); + } else + MI.addOperand(MCOperand::CreateImm(getWidthMinus1(insn) + 1)); + + ++OpIdx; + } + + return true; +} + +// A6.3.4 Table A6-15 Miscellaneous control instructions +// A8.6.41 DMB +// A8.6.42 DSB +// A8.6.49 ISB +static inline bool t2MiscCtrlInstr(uint32_t insn) { + if (slice(insn, 31, 20) == 0xf3b && slice(insn, 15, 14) == 2 && + slice(insn, 12, 12) == 0) + return true; + + return false; +} + +// A6.3.4 Branches and miscellaneous control +// +// A8.6.16 B +// Branches: t2B, t2Bcc -> imm operand +// +// Branches: t2TPsoft -> no operand +// +// A8.6.23 BL, BLX (immediate) +// Branches (defined in ARMInstrThumb.td): tBLr9, tBLXi_r9 -> imm operand +// +// A8.6.26 +// t2BXJ -> Rn +// +// Miscellaneous control: t2Int_MemBarrierV7 (and its t2DMB variants), +// t2Int_SyncBarrierV7 (and its t2DSB varianst), t2ISBsy, t2CLREX +// -> no operand (except pred-imm pred-ccr for CLREX, memory barrier variants) +// +// Hint: t2NOP, t2YIELD, t2WFE, t2WFI, t2SEV +// -> no operand (except pred-imm pred-ccr) +// +// t2DBG -> imm4 = Inst{3-0} +// +// t2MRS/t2MRSsys -> Rs +// t2MSR/t2MSRsys -> Rn mask=Inst{11-8} +// t2SMC -> imm4 = Inst{19-16} +static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + if (NumOps == 0) + return true; + + if (t2MiscCtrlInstr(insn)) + return true; + + switch (Opcode) { + case ARM::t2CLREX: + case ARM::t2NOP: + case ARM::t2YIELD: + case ARM::t2WFE: + case ARM::t2WFI: + case ARM::t2SEV: + return true; + default: + break; + } + + // CPS has a singleton $opt operand that contains the following information: + // opt{4-0} = mode from Inst{4-0} + // opt{5} = changemode from Inst{8} + // opt{8-6} = AIF from Inst{7-5} + // opt{10-9} = imod from Inst{10-9} with 0b10 as enable and 0b11 as disable + if (Opcode == ARM::t2CPS) { + unsigned Option = slice(insn, 4, 0) | slice(insn, 8, 8) << 5 | + slice(insn, 7, 5) << 6 | slice(insn, 10, 9) << 9; + MI.addOperand(MCOperand::CreateImm(Option)); + NumOpsAdded = 1; + return true; + } + + // DBG has its option specified in Inst{3-0}. + if (Opcode == ARM::t2DBG) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0))); + NumOpsAdded = 1; + return true; + } + + // MRS and MRSsys take one GPR reg Rs. + if (Opcode == ARM::t2MRS || Opcode == ARM::t2MRSsys) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRs(insn)))); + NumOpsAdded = 1; + return true; + } + // BXJ takes one GPR reg Rn. 
+ if (Opcode == ARM::t2BXJ) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + NumOpsAdded = 1; + return true; + } + // MSR and MSRsys take one GPR reg Rn, followed by the mask. + if (Opcode == ARM::t2MSR || Opcode == ARM::t2MSRsys || Opcode == ARM::t2BXJ) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + MI.addOperand(MCOperand::CreateImm(slice(insn, 11, 8))); + NumOpsAdded = 2; + return true; + } + // SMC take imm4. + if (Opcode == ARM::t2SMC) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 16))); + NumOpsAdded = 1; + return true; + } + + // Add the imm operand. + int Offset = 0; + + switch (Opcode) { + default: + assert(0 && "Unexpected opcode"); + return false; + case ARM::t2B: + Offset = decodeImm32_B_EncodingT4(insn); + break; + case ARM::t2Bcc: + Offset = decodeImm32_B_EncodingT3(insn); + break; + case ARM::tBLr9: + Offset = decodeImm32_BL(insn); + break; + case ARM::tBLXi_r9: + Offset = decodeImm32_BLX(insn); + break; + } + // When executing a Thumb instruction, PC reads as the address of the current + // instruction plus 4. The assembler subtracts 4 from the difference between + // the branch instruction and the target address, disassembler has to add 4 to + // to compensate. + MI.addOperand(MCOperand::CreateImm(Offset + 4)); + + NumOpsAdded = 1; + + return true; +} + +static inline bool Thumb2PreloadOpcode(unsigned Opcode) { + switch (Opcode) { + default: + return false; + case ARM::t2PLDi12: case ARM::t2PLDi8: case ARM::t2PLDpci: + case ARM::t2PLDr: case ARM::t2PLDs: + case ARM::t2PLDWi12: case ARM::t2PLDWi8: case ARM::t2PLDWpci: + case ARM::t2PLDWr: case ARM::t2PLDWs: + case ARM::t2PLIi12: case ARM::t2PLIi8: case ARM::t2PLIpci: + case ARM::t2PLIr: case ARM::t2PLIs: + return true; + } +} + +static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + // Preload Data/Instruction requires either 2 or 3 operands. + // t2PLDi12, t2PLDi8, t2PLDpci: Rn [+/-]imm12/imm8 + // t2PLDr: Rn Rm + // t2PLDs: Rn Rm imm2=Inst{5-4} + // Same pattern applies for t2PLDW* and t2PLI*. + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 2 && + OpInfo[0].RegClass == ARM::GPRRegClassID && + "Expect >= 2 operands and first one as reg operand"); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + + if (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + } else { + assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + && !OpInfo[OpIdx].isOptionalDef() + && "Pure imm operand expected"); + int Offset = 0; + if (Opcode == ARM::t2PLDpci || Opcode == ARM::t2PLDWpci || + Opcode == ARM::t2PLIpci) { + bool Negative = slice(insn, 23, 23) == 0; + unsigned Imm12 = getImm12(insn); + Offset = Negative ? -1 - Imm12 : 1 * Imm12; + } else if (Opcode == ARM::t2PLDi8 || Opcode == ARM::t2PLDWi8 || + Opcode == ARM::t2PLIi8) { + // A8.6.117 Encoding T2: add = FALSE + unsigned Imm8 = getImm8(insn); + Offset = -1 - Imm8; + } else // The i12 forms. See, for example, A8.6.117 Encoding T1. 
+ Offset = decodeImm12(insn); + MI.addOperand(MCOperand::CreateImm(Offset)); + } + ++OpIdx; + + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 && + !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { + // Fills in the shift amount for t2PLDs, t2PLDWs, t2PLIs. + MI.addOperand(MCOperand::CreateImm(slice(insn, 5, 4))); + ++OpIdx; + } + + return true; +} + +// A8.6.63 LDRB (literal) +// A8.6.79 LDRSB (literal) +// A8.6.75 LDRH (literal) +// A8.6.83 LDRSH (literal) +// A8.6.59 LDR (literal) +// +// These instrs calculate an address from the PC value and an immediate offset. +// Rd Rn=PC (+/-)imm12 (+ if Inst{23} == 0b1) +static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + assert(NumOps >= 2 && + OpInfo[0].RegClass == ARM::GPRRegClassID && + OpInfo[1].RegClass == 0 && + "Expect >= 2 operands, first as reg, and second as imm operand"); + + // Build the register operand, followed by the (+/-)imm12 immediate. + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + + MI.addOperand(MCOperand::CreateImm(decodeImm12(insn))); + + NumOpsAdded = 2; + + return true; +} + +// A6.3.10 Store single data item +// A6.3.9 Load byte, memory hints +// A6.3.8 Load halfword, memory hints +// A6.3.7 Load word +// +// For example, +// +// t2LDRi12: Rd Rn (+)imm12 +// t2LDRi8: Rd Rn (+/-)imm8 (+ if Inst{9} == 0b1) +// t2LDRs: Rd Rn Rm ConstantShiftSpecifier (see also DisassembleThumb2DPSoReg) +// t2LDR_POST: Rd Rn Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1) +// t2LDR_PRE: Rd Rn Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1) +// +// t2STRi12: Rd Rn (+)imm12 +// t2STRi8: Rd Rn (+/-)imm8 (+ if Inst{9} == 0b1) +// t2STRs: Rd Rn Rm ConstantShiftSpecifier (see also DisassembleThumb2DPSoReg) +// t2STR_POST: Rn Rd Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1) +// t2STR_PRE: Rn Rd Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1) +// +// Note that for indexed modes, the Rn(TIED_TO) operand needs to be populated +// correctly, as LLVM AsmPrinter depends on it. For indexed stores, the first +// operand is Rn; for all the other instructions, Rd is the first operand. +// +// Delegates to DisassembleThumb2PreLoad() for preload data/instruction. +// Delegates to DisassembleThumb2Ldpci() for load * literal operations. +static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + unsigned Rn = decodeRn(insn); + + if (Thumb2PreloadOpcode(Opcode)) + return DisassembleThumb2PreLoad(MI, Opcode, insn, NumOps, NumOpsAdded, B); + + // See, for example, A6.3.7 Load word: Table A6-18 Load word. + if (Load && Rn == 15) + return DisassembleThumb2Ldpci(MI, Opcode, insn, NumOps, NumOpsAdded, B); + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 3 && + OpInfo[0].RegClass == ARM::GPRRegClassID && + OpInfo[1].RegClass == ARM::GPRRegClassID && + "Expect >= 3 operands and first two as reg operands"); + + bool ThreeReg = (OpInfo[2].RegClass == ARM::GPRRegClassID); + bool TIED_TO = ThreeReg && TID.getOperandConstraint(2, TOI::TIED_TO) != -1; + bool Imm12 = !ThreeReg && slice(insn, 23, 23) == 1; // ARMInstrThumb2.td + + // Build the register operands, followed by the immediate. 
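+  // As noted above, indexed (writeback) stores list the base register Rn first
+  // and the source register Rd second, while loads and offset-mode stores list
+  // Rd first; the R0/R1 swap below implements that ordering.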
+ unsigned R0, R1, R2 = 0; + unsigned Rd = decodeRd(insn); + int Imm = 0; + + if (!Load && TIED_TO) { + R0 = Rn; + R1 = Rd; + } else { + R0 = Rd; + R1 = Rn; + } + if (ThreeReg) { + if (TIED_TO) { + R2 = Rn; + Imm = decodeImm8(insn); + } else { + R2 = decodeRm(insn); + // See, for example, A8.6.64 LDRB (register). + // And ARMAsmPrinter::printT2AddrModeSoRegOperand(). + // LSL is the default shift opc, and LLVM does not expect it to be encoded + // as part of the immediate operand. + // Imm = ARM_AM::getSORegOpc(ARM_AM::lsl, slice(insn, 5, 4)); + Imm = slice(insn, 5, 4); + } + } else { + if (Imm12) + Imm = getImm12(insn); + else + Imm = decodeImm8(insn); + } + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + R0))); + ++OpIdx; + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + R1))); + ++OpIdx; + + if (ThreeReg) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + R2))); + ++OpIdx; + } + + assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + && !OpInfo[OpIdx].isOptionalDef() + && "Pure imm operand expected"); + + MI.addOperand(MCOperand::CreateImm(Imm)); + ++OpIdx; + + return true; +} + +// A6.3.12 Data-processing (register) +// +// Two register operands [rotate]: Rs Rm [rotation(= (rotate:'000'))] +// Three register operands only: Rs Rn Rm +// Three register operands [rotate]: Rs Rn Rm [rotation(= (rotate:'000'))] +// +// Parallel addition and subtraction 32-bit Thumb instructions: Rs Rn Rm +// +// Miscellaneous operations: Rs [Rn] Rm +static bool DisassembleThumb2DPReg(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; + unsigned &OpIdx = NumOpsAdded; + + OpIdx = 0; + + assert(NumOps >= 2 && + OpInfo[0].RegClass == ARM::GPRRegClassID && + OpInfo[1].RegClass == ARM::GPRRegClassID && + "Expect >= 2 operands and first two as reg operands"); + + // Build the register operands, followed by the optional rotation amount. + + bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID; + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRs(insn)))); + ++OpIdx; + + if (ThreeReg) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + ++OpIdx; + } + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + ++OpIdx; + + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { + // Add the rotation amount immediate. 
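+    // decodeRotate() returns Inst{5-4} * 8, i.e. a byte rotation of 0, 8, 16
+    // or 24 bits, as used by the extend-and-add instructions (see A8.6.221).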
+ MI.addOperand(MCOperand::CreateImm(decodeRotate(insn))); + ++OpIdx; + } + + return true; +} + +// A6.3.16 Multiply, multiply accumulate, and absolute difference +// +// t2MLA, t2MLS, t2SMMLA, t2SMMLS: Rs Rn Rm Ra=Inst{15-12} +// t2MUL, t2SMMUL: Rs Rn Rm +// t2SMLA[BB|BT|TB|TT|WB|WT]: Rs Rn Rm Ra=Inst{15-12} +// t2SMUL[BB|BT|TB|TT|WB|WT]: Rs Rn Rm +// +// Dual halfword multiply: t2SMUAD[X], t2SMUSD[X], t2SMLAD[X], t2SMLSD[X]: +// Rs Rn Rm Ra=Inst{15-12} +// +// Unsigned Sum of Absolute Differences [and Accumulate] +// Rs Rn Rm [Ra=Inst{15-12}] +static bool DisassembleThumb2Mul(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + + assert(NumOps >= 3 && + OpInfo[0].RegClass == ARM::GPRRegClassID && + OpInfo[1].RegClass == ARM::GPRRegClassID && + OpInfo[2].RegClass == ARM::GPRRegClassID && + "Expect >= 3 operands and first three as reg operands"); + + // Build the register operands. + + bool FourReg = NumOps > 3 && OpInfo[3].RegClass == ARM::GPRRegClassID; + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRs(insn)))); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + + if (FourReg) + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + + NumOpsAdded = FourReg ? 4 : 3; + + return true; +} + +// A6.3.17 Long multiply, long multiply accumulate, and divide +// +// t2SMULL, t2UMULL, t2SMLAL, t2UMLAL, t2UMAAL: RdLo RdHi Rn Rm +// where RdLo = Inst{15-12} and RdHi = Inst{11-8} +// +// Halfword multiple accumulate long: t2SMLAL<x><y>: RdLo RdHi Rn Rm +// where RdLo = Inst{15-12} and RdHi = Inst{11-8} +// +// Dual halfword multiple: t2SMLALD[X], t2SMLSLD[X]: RdLo RdHi Rn Rm +// where RdLo = Inst{15-12} and RdHi = Inst{11-8} +// +// Signed/Unsigned divide: t2SDIV, t2UDIV: Rs Rn Rm +static bool DisassembleThumb2LongMul(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + + assert(NumOps >= 3 && + OpInfo[0].RegClass == ARM::GPRRegClassID && + OpInfo[1].RegClass == ARM::GPRRegClassID && + OpInfo[2].RegClass == ARM::GPRRegClassID && + "Expect >= 3 operands and first three as reg operands"); + + bool FourReg = NumOps > 3 && OpInfo[3].RegClass == ARM::GPRRegClassID; + + // Build the register operands. + + if (FourReg) + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRs(insn)))); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRn(insn)))); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + + if (FourReg) + NumOpsAdded = 4; + else + NumOpsAdded = 3; + + return true; +} + +// See A6.3 32-bit Thumb instruction encoding for instruction classes +// corresponding to (op1, op2, op). 
+// +// Table A6-9 32-bit Thumb instruction encoding +// op1 op2 op Instruction class, see +// --- ------- -- ------------------------------------------------------------ +// 01 00xx0xx - Load/store multiple on page A6-23 +// 00xx1xx - Load/store dual, load/store exclusive, table branch on page A6-24 +// 01xxxxx - Data-processing (shifted register) on page A6-31 +// 1xxxxxx - Coprocessor instructions on page A6-40 +// 10 x0xxxxx 0 Data-processing (modified immediate) on page A6-15 +// x1xxxxx 0 Data-processing (plain binary immediate) on page A6-19 +// - 1 Branches and miscellaneous control on page A6-20 +// 11 000xxx0 - Store single data item on page A6-30 +// 001xxx0 - Advanced SIMD element or structure load/store instructions on page A7-27 +// 00xx001 - Load byte, memory hints on page A6-28 +// 00xx011 - Load halfword, memory hints on page A6-26 +// 00xx101 - Load word on page A6-25 +// 00xx111 - UNDEFINED +// 010xxxx - Data-processing (register) on page A6-33 +// 0110xxx - Multiply, multiply accumulate, and absolute difference on page A6-38 +// 0111xxx - Long multiply, long multiply accumulate, and divide on page A6-39 +// 1xxxxxx - Coprocessor instructions on page A6-40 +// +static bool DisassembleThumb2(uint16_t op1, uint16_t op2, uint16_t op, + MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, + unsigned &NumOpsAdded, BO B) { + + switch (op1) { + case 1: + if (slice(op2, 6, 5) == 0) { + if (slice(op2, 2, 2) == 0) { + // Load/store multiple. + return DisassembleThumb2LdStMul(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + } + + // Load/store dual, load/store exclusive, table branch, otherwise. + assert(slice(op2, 2, 2) == 1 && "Thumb2 encoding error!"); + if ((ARM::t2LDREX <= Opcode && Opcode <= ARM::t2LDREXH) || + (ARM::t2STREX <= Opcode && Opcode <= ARM::t2STREXH)) { + // Load/store exclusive. + return DisassembleThumb2LdStEx(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + } + if (Opcode == ARM::t2LDRDi8 || + Opcode == ARM::t2LDRD_PRE || Opcode == ARM::t2LDRD_POST || + Opcode == ARM::t2STRDi8 || + Opcode == ARM::t2STRD_PRE || Opcode == ARM::t2STRD_POST) { + // Load/store dual. + return DisassembleThumb2LdStDual(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + } + if (Opcode == ARM::t2TBBgen || Opcode == ARM::t2TBHgen) { + // Table branch. + return DisassembleThumb2TB(MI, Opcode, insn, NumOps, NumOpsAdded, B); + } + } else if (slice(op2, 6, 5) == 1) { + // Data-processing (shifted register). + return DisassembleThumb2DPSoReg(MI, Opcode, insn, NumOps, NumOpsAdded, B); + } + + // FIXME: A6.3.18 Coprocessor instructions + // But see ThumbDisassembler::getInstruction(). + + break; + case 2: + if (op == 0) { + if (slice(op2, 5, 5) == 0) { + // Data-processing (modified immediate) + return DisassembleThumb2DPModImm(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + } else { + // Data-processing (plain binary immediate) + return DisassembleThumb2DPBinImm(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + } + } else { + // Branches and miscellaneous control on page A6-20. + return DisassembleThumb2BrMiscCtrl(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + } + + break; + case 3: + switch (slice(op2, 6, 5)) { + case 0: + // Load/store instructions... + if (slice(op2, 0, 0) == 0) { + if (slice(op2, 4, 4) == 0) { + // Store single data item on page A6-30 + return DisassembleThumb2LdSt(false, MI,Opcode,insn,NumOps,NumOpsAdded, + B); + } else { + // FIXME: Advanced SIMD element or structure load/store instructions. + // But see ThumbDisassembler::getInstruction(). 
+ ; + } + } else { + // Table A6-9 32-bit Thumb instruction encoding: Load byte|halfword|word + return DisassembleThumb2LdSt(true, MI,Opcode,insn,NumOps,NumOpsAdded, B); + } + break; + case 1: + if (slice(op2, 4, 4) == 0) { + // A6.3.12 Data-processing (register) + return DisassembleThumb2DPReg(MI, Opcode, insn, NumOps, NumOpsAdded, B); + } else if (slice(op2, 3, 3) == 0) { + // A6.3.16 Multiply, multiply accumulate, and absolute difference + return DisassembleThumb2Mul(MI, Opcode, insn, NumOps, NumOpsAdded, B); + } else { + // A6.3.17 Long multiply, long multiply accumulate, and divide + return DisassembleThumb2LongMul(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + } + break; + default: + // FIXME: A6.3.18 Coprocessor instructions + // But see ThumbDisassembler::getInstruction(). + ; + break; + } + + break; + default: + assert(0 && "Thumb2 encoding error!"); + break; + } + + return false; +} + +static bool DisassembleThumbFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO Builder) { + + uint16_t HalfWord = slice(insn, 31, 16); + + if (HalfWord == 0) { + // A6.2 16-bit Thumb instruction encoding + // op = bits[15:10] + uint16_t op = slice(insn, 15, 10); + return DisassembleThumb1(op, MI, Opcode, insn, NumOps, NumOpsAdded, + Builder); + } + + unsigned bits15_11 = slice(HalfWord, 15, 11); + + // A6.1 Thumb instruction set encoding + if (!(bits15_11 == 0x1D || bits15_11 == 0x1E || bits15_11 == 0x1F)) { + assert("Bits[15:11] first halfword of Thumb2 instruction is out of range"); + return false; + } + + // A6.3 32-bit Thumb instruction encoding + + uint16_t op1 = slice(HalfWord, 12, 11); + uint16_t op2 = slice(HalfWord, 10, 4); + uint16_t op = slice(insn, 15, 15); + + return DisassembleThumb2(op1, op2, op, MI, Opcode, insn, NumOps, NumOpsAdded, + Builder); +} diff --git a/contrib/llvm/lib/Target/ARM/Makefile b/contrib/llvm/lib/Target/ARM/Makefile new file mode 100644 index 0000000..9e3ff29 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Makefile @@ -0,0 +1,24 @@ +##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMARMCodeGen +TARGET = ARM + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \ + ARMGenRegisterInfo.inc ARMGenInstrNames.inc \ + ARMGenInstrInfo.inc ARMGenAsmWriter.inc \ + ARMGenDAGISel.inc ARMGenSubtarget.inc \ + ARMGenCodeEmitter.inc ARMGenCallingConv.inc \ + ARMGenDecoderTables.inc ARMGenEDInfo.inc + +DIRS = AsmPrinter AsmParser Disassembler TargetInfo + +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/ARM/NEONMoveFix.cpp b/contrib/llvm/lib/Target/ARM/NEONMoveFix.cpp new file mode 100644 index 0000000..0a4400c --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/NEONMoveFix.cpp @@ -0,0 +1,141 @@ +//===-- NEONMoveFix.cpp - Convert vfp reg-reg moves into neon ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "neon-mov-fix" +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMInstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +STATISTIC(NumVMovs, "Number of reg-reg moves converted"); + +namespace { + struct NEONMoveFixPass : public MachineFunctionPass { + static char ID; + NEONMoveFixPass() : MachineFunctionPass(&ID) {} + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "NEON reg-reg move conversion"; + } + + private: + const TargetRegisterInfo *TRI; + const ARMBaseInstrInfo *TII; + + typedef DenseMap<unsigned, const MachineInstr*> RegMap; + + bool InsertMoves(MachineBasicBlock &MBB); + }; + char NEONMoveFixPass::ID = 0; +} + +bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) { + RegMap Defs; + bool Modified = false; + + // Walk over MBB tracking the def points of the registers. + MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end(); + MachineBasicBlock::iterator NextMII; + for (; MII != E; MII = NextMII) { + NextMII = llvm::next(MII); + MachineInstr *MI = &*MII; + + if (MI->getOpcode() == ARM::VMOVD && + !TII->isPredicated(MI)) { + unsigned SrcReg = MI->getOperand(1).getReg(); + // If we do not find an instruction defining the reg, this means the + // register should be live-in for this BB. It's always to better to use + // NEON reg-reg moves. + unsigned Domain = ARMII::DomainNEON; + RegMap::iterator DefMI = Defs.find(SrcReg); + if (DefMI != Defs.end()) { + Domain = DefMI->second->getDesc().TSFlags & ARMII::DomainMask; + // Instructions in general domain are subreg accesses. + // Map them to NEON reg-reg moves. + if (Domain == ARMII::DomainGeneral) + Domain = ARMII::DomainNEON; + } + + if (Domain & ARMII::DomainNEON) { + // Convert VMOVD to VMOVDneon + unsigned DestReg = MI->getOperand(0).getReg(); + + DEBUG({errs() << "vmov convert: "; MI->dump();}); + + // It's safe to ignore imp-defs / imp-uses here, since: + // - We're running late, no intelligent condegen passes should be run + // afterwards + // - The imp-defs / imp-uses are superregs only, we don't care about + // them. + AddDefaultPred(BuildMI(MBB, *MI, MI->getDebugLoc(), + TII->get(ARM::VMOVDneon), DestReg).addReg(SrcReg)); + MBB.erase(MI); + MachineBasicBlock::iterator I = prior(NextMII); + MI = &*I; + + DEBUG({errs() << " into: "; MI->dump();}); + + Modified = true; + ++NumVMovs; + } else { + assert((Domain & ARMII::DomainVFP) && "Invalid domain!"); + // Do nothing. + } + } + + // Update def information. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand& MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned MOReg = MO.getReg(); + + Defs[MOReg] = MI; + // Catch subregs as well. 
+ for (const unsigned *R = TRI->getSubRegisters(MOReg); *R; ++R) + Defs[*R] = MI; + } + } + + return Modified; +} + +bool NEONMoveFixPass::runOnMachineFunction(MachineFunction &Fn) { + ARMFunctionInfo *AFI = Fn.getInfo<ARMFunctionInfo>(); + const TargetMachine &TM = Fn.getTarget(); + + if (AFI->isThumb1OnlyFunction()) + return false; + + TRI = TM.getRegisterInfo(); + TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo()); + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= InsertMoves(MBB); + } + + return Modified; +} + +/// createNEONMoveFixPass - Returns an instance of the NEON reg-reg moves fix +/// pass. +FunctionPass *llvm::createNEONMoveFixPass() { + return new NEONMoveFixPass(); +} diff --git a/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp b/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp new file mode 100644 index 0000000..a725898 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp @@ -0,0 +1,530 @@ +//===-- NEONPreAllocPass.cpp - Allocate adjacent NEON registers--*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "neon-prealloc" +#include "ARM.h" +#include "ARMInstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +using namespace llvm; + +namespace { + class NEONPreAllocPass : public MachineFunctionPass { + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + + public: + static char ID; + NEONPreAllocPass() : MachineFunctionPass(&ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "NEON register pre-allocation pass"; + } + + private: + bool FormsRegSequence(MachineInstr *MI, + unsigned FirstOpnd, unsigned NumRegs, + unsigned Offset, unsigned Stride) const; + bool PreAllocNEONRegisters(MachineBasicBlock &MBB); + }; + + char NEONPreAllocPass::ID = 0; +} + +static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, + unsigned &Offset, unsigned &Stride) { + // Default to unit stride with no offset. 
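+  // The outputs are: FirstOpnd, the index of the first operand in the
+  // instruction's multi-register group; NumRegs, how many consecutive D
+  // registers that group covers; and Offset/Stride, which place those
+  // registers within the fixed D0-D7 window used below
+  // (NEONDRegs[Offset + R * Stride]).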
+ Stride = 1; + Offset = 0; + + switch (Opcode) { + default: + break; + + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: + case ARM::VLD2d8: + case ARM::VLD2d16: + case ARM::VLD2d32: + case ARM::VLD2LNd8: + case ARM::VLD2LNd16: + case ARM::VLD2LNd32: + FirstOpnd = 0; + NumRegs = 2; + return true; + + case ARM::VLD2q8: + case ARM::VLD2q16: + case ARM::VLD2q32: + FirstOpnd = 0; + NumRegs = 4; + return true; + + case ARM::VLD2LNq16: + case ARM::VLD2LNq32: + FirstOpnd = 0; + NumRegs = 2; + Offset = 0; + Stride = 2; + return true; + + case ARM::VLD2LNq16odd: + case ARM::VLD2LNq32odd: + FirstOpnd = 0; + NumRegs = 2; + Offset = 1; + Stride = 2; + return true; + + case ARM::VLD3d8: + case ARM::VLD3d16: + case ARM::VLD3d32: + case ARM::VLD1d64T: + case ARM::VLD3LNd8: + case ARM::VLD3LNd16: + case ARM::VLD3LNd32: + FirstOpnd = 0; + NumRegs = 3; + return true; + + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + FirstOpnd = 0; + NumRegs = 3; + Offset = 0; + Stride = 2; + return true; + + case ARM::VLD3q8odd_UPD: + case ARM::VLD3q16odd_UPD: + case ARM::VLD3q32odd_UPD: + FirstOpnd = 0; + NumRegs = 3; + Offset = 1; + Stride = 2; + return true; + + case ARM::VLD3LNq16: + case ARM::VLD3LNq32: + FirstOpnd = 0; + NumRegs = 3; + Offset = 0; + Stride = 2; + return true; + + case ARM::VLD3LNq16odd: + case ARM::VLD3LNq32odd: + FirstOpnd = 0; + NumRegs = 3; + Offset = 1; + Stride = 2; + return true; + + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD1d64Q: + case ARM::VLD4LNd8: + case ARM::VLD4LNd16: + case ARM::VLD4LNd32: + FirstOpnd = 0; + NumRegs = 4; + return true; + + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + FirstOpnd = 0; + NumRegs = 4; + Offset = 0; + Stride = 2; + return true; + + case ARM::VLD4q8odd_UPD: + case ARM::VLD4q16odd_UPD: + case ARM::VLD4q32odd_UPD: + FirstOpnd = 0; + NumRegs = 4; + Offset = 1; + Stride = 2; + return true; + + case ARM::VLD4LNq16: + case ARM::VLD4LNq32: + FirstOpnd = 0; + NumRegs = 4; + Offset = 0; + Stride = 2; + return true; + + case ARM::VLD4LNq16odd: + case ARM::VLD4LNq32odd: + FirstOpnd = 0; + NumRegs = 4; + Offset = 1; + Stride = 2; + return true; + + case ARM::VST1q8: + case ARM::VST1q16: + case ARM::VST1q32: + case ARM::VST1q64: + case ARM::VST2d8: + case ARM::VST2d16: + case ARM::VST2d32: + case ARM::VST2LNd8: + case ARM::VST2LNd16: + case ARM::VST2LNd32: + FirstOpnd = 2; + NumRegs = 2; + return true; + + case ARM::VST2q8: + case ARM::VST2q16: + case ARM::VST2q32: + FirstOpnd = 2; + NumRegs = 4; + return true; + + case ARM::VST2LNq16: + case ARM::VST2LNq32: + FirstOpnd = 2; + NumRegs = 2; + Offset = 0; + Stride = 2; + return true; + + case ARM::VST2LNq16odd: + case ARM::VST2LNq32odd: + FirstOpnd = 2; + NumRegs = 2; + Offset = 1; + Stride = 2; + return true; + + case ARM::VST3d8: + case ARM::VST3d16: + case ARM::VST3d32: + case ARM::VST1d64T: + case ARM::VST3LNd8: + case ARM::VST3LNd16: + case ARM::VST3LNd32: + FirstOpnd = 2; + NumRegs = 3; + return true; + + case ARM::VST3q8_UPD: + case ARM::VST3q16_UPD: + case ARM::VST3q32_UPD: + FirstOpnd = 4; + NumRegs = 3; + Offset = 0; + Stride = 2; + return true; + + case ARM::VST3q8odd_UPD: + case ARM::VST3q16odd_UPD: + case ARM::VST3q32odd_UPD: + FirstOpnd = 4; + NumRegs = 3; + Offset = 1; + Stride = 2; + return true; + + case ARM::VST3LNq16: + case ARM::VST3LNq32: + FirstOpnd = 2; + NumRegs = 3; + Offset = 0; + Stride = 2; + return true; + + case ARM::VST3LNq16odd: + case ARM::VST3LNq32odd: + FirstOpnd = 2; 
+ NumRegs = 3; + Offset = 1; + Stride = 2; + return true; + + case ARM::VST4d8: + case ARM::VST4d16: + case ARM::VST4d32: + case ARM::VST1d64Q: + case ARM::VST4LNd8: + case ARM::VST4LNd16: + case ARM::VST4LNd32: + FirstOpnd = 2; + NumRegs = 4; + return true; + + case ARM::VST4q8_UPD: + case ARM::VST4q16_UPD: + case ARM::VST4q32_UPD: + FirstOpnd = 4; + NumRegs = 4; + Offset = 0; + Stride = 2; + return true; + + case ARM::VST4q8odd_UPD: + case ARM::VST4q16odd_UPD: + case ARM::VST4q32odd_UPD: + FirstOpnd = 4; + NumRegs = 4; + Offset = 1; + Stride = 2; + return true; + + case ARM::VST4LNq16: + case ARM::VST4LNq32: + FirstOpnd = 2; + NumRegs = 4; + Offset = 0; + Stride = 2; + return true; + + case ARM::VST4LNq16odd: + case ARM::VST4LNq32odd: + FirstOpnd = 2; + NumRegs = 4; + Offset = 1; + Stride = 2; + return true; + + case ARM::VTBL2: + FirstOpnd = 1; + NumRegs = 2; + return true; + + case ARM::VTBL3: + FirstOpnd = 1; + NumRegs = 3; + return true; + + case ARM::VTBL4: + FirstOpnd = 1; + NumRegs = 4; + return true; + + case ARM::VTBX2: + FirstOpnd = 2; + NumRegs = 2; + return true; + + case ARM::VTBX3: + FirstOpnd = 2; + NumRegs = 3; + return true; + + case ARM::VTBX4: + FirstOpnd = 2; + NumRegs = 4; + return true; + } + + return false; +} + +bool +NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, + unsigned FirstOpnd, unsigned NumRegs, + unsigned Offset, unsigned Stride) const { + MachineOperand &FMO = MI->getOperand(FirstOpnd); + assert(FMO.isReg() && FMO.getSubReg() == 0 && "unexpected operand"); + unsigned VirtReg = FMO.getReg(); + (void)VirtReg; + assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && + "expected a virtual register"); + + unsigned LastSubIdx = 0; + if (FMO.isDef()) { + MachineInstr *RegSeq = 0; + for (unsigned R = 0; R < NumRegs; ++R) { + const MachineOperand &MO = MI->getOperand(FirstOpnd + R); + assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); + unsigned VirtReg = MO.getReg(); + assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && + "expected a virtual register"); + // Feeding into a REG_SEQUENCE. + if (!MRI->hasOneNonDBGUse(VirtReg)) + return false; + MachineInstr *UseMI = &*MRI->use_nodbg_begin(VirtReg); + if (!UseMI->isRegSequence()) + return false; + if (RegSeq && RegSeq != UseMI) + return false; + unsigned OpIdx = 1 + (Offset + R * Stride) * 2; + if (UseMI->getOperand(OpIdx).getReg() != VirtReg) + llvm_unreachable("Malformed REG_SEQUENCE instruction!"); + unsigned SubIdx = UseMI->getOperand(OpIdx + 1).getImm(); + if (LastSubIdx) { + if (LastSubIdx != SubIdx-Stride) + return false; + } else { + // Must start from dsub_0 or qsub_0. + if (SubIdx != (ARM::dsub_0+Offset) && + SubIdx != (ARM::qsub_0+Offset)) + return false; + } + RegSeq = UseMI; + LastSubIdx = SubIdx; + } + + // In the case of vld3, etc., make sure the trailing operand of + // REG_SEQUENCE is an undef. 
+ if (NumRegs == 3) { + unsigned OpIdx = 1 + (Offset + 3 * Stride) * 2; + const MachineOperand &MO = RegSeq->getOperand(OpIdx); + unsigned VirtReg = MO.getReg(); + MachineInstr *DefMI = MRI->getVRegDef(VirtReg); + if (!DefMI || !DefMI->isImplicitDef()) + return false; + } + return true; + } + + unsigned LastSrcReg = 0; + SmallVector<unsigned, 4> SubIds; + for (unsigned R = 0; R < NumRegs; ++R) { + const MachineOperand &MO = MI->getOperand(FirstOpnd + R); + assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); + unsigned VirtReg = MO.getReg(); + assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && + "expected a virtual register"); + // Extracting from a Q or QQ register. + MachineInstr *DefMI = MRI->getVRegDef(VirtReg); + if (!DefMI || !DefMI->isExtractSubreg()) + return false; + VirtReg = DefMI->getOperand(1).getReg(); + if (LastSrcReg && LastSrcReg != VirtReg) + return false; + LastSrcReg = VirtReg; + const TargetRegisterClass *RC = MRI->getRegClass(VirtReg); + if (RC != ARM::QPRRegisterClass && + RC != ARM::QQPRRegisterClass && + RC != ARM::QQQQPRRegisterClass) + return false; + unsigned SubIdx = DefMI->getOperand(2).getImm(); + if (LastSubIdx) { + if (LastSubIdx != SubIdx-Stride) + return false; + } else { + // Must start from dsub_0 or qsub_0. + if (SubIdx != (ARM::dsub_0+Offset) && + SubIdx != (ARM::qsub_0+Offset)) + return false; + } + SubIds.push_back(SubIdx); + LastSubIdx = SubIdx; + } + + // FIXME: Update the uses of EXTRACT_SUBREG from REG_SEQUENCE is + // currently required for correctness. e.g. + // %reg1041;<def> = REG_SEQUENCE %reg1040<kill>, 5, %reg1035<kill>, 6 + // %reg1042<def> = EXTRACT_SUBREG %reg1041, 6 + // %reg1043<def> = EXTRACT_SUBREG %reg1041, 5 + // VST1q16 %reg1025<kill>, 0, %reg1043<kill>, %reg1042<kill>, + // reg1025 and reg1043 should be replaced with reg1041:6 and reg1041:5 + // respectively. + // We need to change how we model uses of REG_SEQUENCE. + for (unsigned R = 0; R < NumRegs; ++R) { + MachineOperand &MO = MI->getOperand(FirstOpnd + R); + unsigned OldReg = MO.getReg(); + MachineInstr *DefMI = MRI->getVRegDef(OldReg); + assert(DefMI->isExtractSubreg()); + MO.setReg(LastSrcReg); + MO.setSubReg(SubIds[R]); + if (R != 0) + MO.setIsKill(false); + // Delete the EXTRACT_SUBREG if its result is now dead. + if (MRI->use_empty(OldReg)) + DefMI->eraseFromParent(); + } + + return true; +} + +bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + for (; MBBI != E; ++MBBI) { + MachineInstr *MI = &*MBBI; + unsigned FirstOpnd, NumRegs, Offset, Stride; + if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride)) + continue; + if (llvm::ModelWithRegSequence() && + FormsRegSequence(MI, FirstOpnd, NumRegs, Offset, Stride)) + continue; + + MachineBasicBlock::iterator NextI = llvm::next(MBBI); + for (unsigned R = 0; R < NumRegs; ++R) { + MachineOperand &MO = MI->getOperand(FirstOpnd + R); + assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); + unsigned VirtReg = MO.getReg(); + assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && + "expected a virtual register"); + + // For now, just assign a fixed set of adjacent registers. + // This leaves plenty of room for future improvements. + static const unsigned NEONDRegs[] = { + ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7 + }; + MO.setReg(NEONDRegs[Offset + R * Stride]); + + if (MO.isUse()) { + // Insert a copy from VirtReg. 
+ TII->copyRegToReg(MBB, MBBI, MO.getReg(), VirtReg, + ARM::DPRRegisterClass, ARM::DPRRegisterClass, + DebugLoc()); + if (MO.isKill()) { + MachineInstr *CopyMI = prior(MBBI); + CopyMI->findRegisterUseOperand(VirtReg)->setIsKill(); + } + MO.setIsKill(); + } else if (MO.isDef() && !MO.isDead()) { + // Add a copy to VirtReg. + TII->copyRegToReg(MBB, NextI, VirtReg, MO.getReg(), + ARM::DPRRegisterClass, ARM::DPRRegisterClass, + DebugLoc()); + } + } + } + + return Modified; +} + +bool NEONPreAllocPass::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getTarget().getInstrInfo(); + MRI = &MF.getRegInfo(); + + bool Modified = false; + for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= PreAllocNEONRegisters(MBB); + } + + return Modified; +} + +/// createNEONPreAllocPass - returns an instance of the NEON register +/// pre-allocation pass. +FunctionPass *llvm::createNEONPreAllocPass() { + return new NEONPreAllocPass(); +} diff --git a/contrib/llvm/lib/Target/ARM/README-Thumb.txt b/contrib/llvm/lib/Target/ARM/README-Thumb.txt new file mode 100644 index 0000000..6b605bb --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/README-Thumb.txt @@ -0,0 +1,248 @@ +//===---------------------------------------------------------------------===// +// Random ideas for the ARM backend (Thumb specific). +//===---------------------------------------------------------------------===// + +* Add support for compiling functions in both ARM and Thumb mode, then taking + the smallest. + +* Add support for compiling individual basic blocks in thumb mode, when in a + larger ARM function. This can be used for presumed cold code, like paths + to abort (failure path of asserts), EH handling code, etc. + +* Thumb doesn't have normal pre/post increment addressing modes, but you can + load/store 32-bit integers with pre/postinc by using load/store multiple + instrs with a single register. + +* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add + and cmp instructions can use high registers. Also, we can use them as + temporaries to spill values into. + +* In thumb mode, short, byte, and bool preferred alignments are currently set + to 4 to accommodate ISA restriction (i.e. add sp, #imm, imm must be multiple + of 4). + +//===---------------------------------------------------------------------===// + +Potential jumptable improvements: + +* If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit + jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the + function is even smaller. This also applies to ARM. + +* Thumb jumptable codegen can improve given some help from the assembler. This + is what we generate right now: + + .set PCRELV0, (LJTI1_0_0-(LPCRELL0+4)) +LPCRELL0: + mov r1, #PCRELV0 + add r1, pc + ldr r0, [r0, r1] + mov pc, r0 + .align 2 +LJTI1_0_0: + .long LBB1_3 + ... + +Note there is another pc relative add that we can take advantage of. 
+ add r1, pc, #imm_8 * 4 + +We should be able to generate: + +LPCRELL0: + add r1, LJTI1_0_0 + ldr r0, [r0, r1] + mov pc, r0 + .align 2 +LJTI1_0_0: + .long LBB1_3 + +if the assembler can translate the add to: + add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc) + +Note the assembler also does something similar to constpool load: +LPCRELL0: + ldr r0, LCPI1_0 +=> + ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc) + + +//===---------------------------------------------------------------------===// + +We compiles the following: + +define i16 @func_entry_2E_ce(i32 %i) { + switch i32 %i, label %bb12.exitStub [ + i32 0, label %bb4.exitStub + i32 1, label %bb9.exitStub + i32 2, label %bb4.exitStub + i32 3, label %bb4.exitStub + i32 7, label %bb9.exitStub + i32 8, label %bb.exitStub + i32 9, label %bb9.exitStub + ] + +bb12.exitStub: + ret i16 0 + +bb4.exitStub: + ret i16 1 + +bb9.exitStub: + ret i16 2 + +bb.exitStub: + ret i16 3 +} + +into: + +_func_entry_2E_ce: + mov r2, #1 + lsl r2, r0 + cmp r0, #9 + bhi LBB1_4 @bb12.exitStub +LBB1_1: @newFuncRoot + mov r1, #13 + tst r2, r1 + bne LBB1_5 @bb4.exitStub +LBB1_2: @newFuncRoot + ldr r1, LCPI1_0 + tst r2, r1 + bne LBB1_6 @bb9.exitStub +LBB1_3: @newFuncRoot + mov r1, #1 + lsl r1, r1, #8 + tst r2, r1 + bne LBB1_7 @bb.exitStub +LBB1_4: @bb12.exitStub + mov r0, #0 + bx lr +LBB1_5: @bb4.exitStub + mov r0, #1 + bx lr +LBB1_6: @bb9.exitStub + mov r0, #2 + bx lr +LBB1_7: @bb.exitStub + mov r0, #3 + bx lr +LBB1_8: + .align 2 +LCPI1_0: + .long 642 + + +gcc compiles to: + + cmp r0, #9 + @ lr needed for prologue + bhi L2 + ldr r3, L11 + mov r2, #1 + mov r1, r2, asl r0 + ands r0, r3, r2, asl r0 + movne r0, #2 + bxne lr + tst r1, #13 + beq L9 +L3: + mov r0, r2 + bx lr +L9: + tst r1, #256 + movne r0, #3 + bxne lr +L2: + mov r0, #0 + bx lr +L12: + .align 2 +L11: + .long 642 + + +GCC is doing a couple of clever things here: + 1. It is predicating one of the returns. This isn't a clear win though: in + cases where that return isn't taken, it is replacing one condbranch with + two 'ne' predicated instructions. + 2. It is sinking the shift of "1 << i" into the tst, and using ands instead of + tst. This will probably require whole function isel. + 3. GCC emits: + tst r1, #256 + we emit: + mov r1, #1 + lsl r1, r1, #8 + tst r2, r1 + + +//===---------------------------------------------------------------------===// + +When spilling in thumb mode and the sp offset is too large to fit in the ldr / +str offset field, we load the offset from a constpool entry and add it to sp: + +ldr r2, LCPI +add r2, sp +ldr r2, [r2] + +These instructions preserve the condition code which is important if the spill +is between a cmp and a bcc instruction. However, we can use the (potentially) +cheaper sequnce if we know it's ok to clobber the condition register. + +add r2, sp, #255 * 4 +add r2, #132 +ldr r2, [r2, #7 * 4] + +This is especially bad when dynamic alloca is used. The all fixed size stack +objects are referenced off the frame pointer with negative offsets. See +oggenc for an example. + + +//===---------------------------------------------------------------------===// + +Poor codegen test/CodeGen/ARM/select.ll f7: + + ldr r5, LCPI1_0 +LPC0: + add r5, pc + ldr r6, LCPI1_1 + ldr r2, LCPI1_2 + mov r3, r6 + mov lr, pc + bx r5 + +//===---------------------------------------------------------------------===// + +Make register allocator / spiller smarter so we can re-materialize "mov r, imm", +etc. Almost all Thumb instructions clobber condition code. 
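+
+A hypothetical sketch (not from the test suite) of the kind of constant this
+is about; under register pressure the mask below is today spilled and reloaded
+rather than re-materialized:
+
+extern void use(int);
+void f(int *p, int n) {
+  for (int i = 0; i < n; ++i)
+    use(p[i] & 0xAA);   // the mask is materialized with "mov rN, #170"; if rN
+}                       // must be freed across the call, re-emitting the mov
+                        // is cheaper than a spill / reload pair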
+ 
+//===---------------------------------------------------------------------===//
+
+Add ldmia, stmia support.
+
+//===---------------------------------------------------------------------===//
+
+Thumb load / store address mode offsets are scaled. The values kept in the
+instruction operands are pre-scale values. This probably ought to be changed
+to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.
+
+//===---------------------------------------------------------------------===//
+
+We need to make (some of the) Thumb1 instructions predicable. That will allow
+shrinking of predicated Thumb2 instructions. To allow this, we need to be able
+to toggle the 's' bit since they do not set CPSR when they are inside IT blocks.
+
+//===---------------------------------------------------------------------===//
+
+Make use of hi register variants of cmp: tCMPhir / tCMPZhir.
+
+//===---------------------------------------------------------------------===//
+
+Thumb1 immediate fields sometimes keep pre-scaled values. See
+Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
+Thumb2.
+
+//===---------------------------------------------------------------------===//
+
+Rather than having tBR_JTr print a ".align 2" and constant island pass pad it,
+add a target specific ALIGN instruction instead. That way, GetInstSizeInBytes
+won't have to over-estimate. It can also be used for the loop alignment pass. diff --git a/contrib/llvm/lib/Target/ARM/README-Thumb2.txt b/contrib/llvm/lib/Target/ARM/README-Thumb2.txt new file mode 100644 index 0000000..e7c2552 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/README-Thumb2.txt @@ -0,0 +1,6 @@ +//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend (Thumb2 specific).
+//===---------------------------------------------------------------------===//
+
+Make sure jumptable destinations are below the jumptable in order to make use
+of tbb / tbh. diff --git a/contrib/llvm/lib/Target/ARM/README.txt b/contrib/llvm/lib/Target/ARM/README.txt new file mode 100644 index 0000000..85d5ca0 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/README.txt @@ -0,0 +1,592 @@ +//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend.
+//===---------------------------------------------------------------------===//
+
+Reimplement 'select' in terms of 'SEL'.
+
+* We would really like to support UXTAB16, but we need to prove that the
+ add doesn't need to overflow between the two 16-bit chunks.
+
+* Implement pre/post increment support. (e.g. PR935)
+* Implement smarter constant generation for binops with large immediates.
+
+A few ARMv6T2 ops should be pattern matched: BFI, SBFX, and UBFX
+
+Interesting optimization for PIC codegen on arm-linux:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43129
+
+//===---------------------------------------------------------------------===//
+
+Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. By the
+time regalloc happens, these values are now in a 32-bit register, usually with
+the top-bits known to be sign or zero extended. If spilled, we should be able
+to spill these to an 8-bit or 16-bit stack slot, zero or sign extending as part
+of the reload.
+
+Doing this reduces the size of the stack frame (important for thumb etc), and
+also increases the likelihood that we will be able to reload multiple values
+from the stack with a single load.
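+
+A hypothetical illustration (not from the test suite): all four byte values
+below are widened to 32 bits by the time register allocation runs, yet any of
+them that has to live across the call currently needs a full 4-byte spill
+slot:
+
+extern void consume(unsigned char);
+unsigned sum4(const unsigned char *p) {
+  unsigned char a = p[0], b = p[1], c = p[2], d = p[3];
+  consume(a);            // a-d are live across the call and may be spilled;
+  return a + b + c + d;  // 1-byte slots with a zero-extending reload (ldrb)
+}                        // would shrink the frame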
+ +//===---------------------------------------------------------------------===// + +The constant island pass is in good shape. Some cleanups might be desirable, +but there is unlikely to be much improvement in the generated code. + +1. There may be some advantage to trying to be smarter about the initial +placement, rather than putting everything at the end. + +2. There might be some compile-time efficiency to be had by representing +consecutive islands as a single block rather than multiple blocks. + +3. Use a priority queue to sort constant pool users in inverse order of + position so we always process the one closed to the end of functions + first. This may simply CreateNewWater. + +//===---------------------------------------------------------------------===// + +Eliminate copysign custom expansion. We are still generating crappy code with +default expansion + if-conversion. + +//===---------------------------------------------------------------------===// + +Eliminate one instruction from: + +define i32 @_Z6slow4bii(i32 %x, i32 %y) { + %tmp = icmp sgt i32 %x, %y + %retval = select i1 %tmp, i32 %x, i32 %y + ret i32 %retval +} + +__Z6slow4bii: + cmp r0, r1 + movgt r1, r0 + mov r0, r1 + bx lr +=> + +__Z6slow4bii: + cmp r0, r1 + movle r0, r1 + bx lr + +//===---------------------------------------------------------------------===// + +Implement long long "X-3" with instructions that fold the immediate in. These +were disabled due to badness with the ARM carry flag on subtracts. + +//===---------------------------------------------------------------------===// + +More load / store optimizations: +1) Better representation for block transfer? This is from Olden/power: + + fldd d0, [r4] + fstd d0, [r4, #+32] + fldd d0, [r4, #+8] + fstd d0, [r4, #+40] + fldd d0, [r4, #+16] + fstd d0, [r4, #+48] + fldd d0, [r4, #+24] + fstd d0, [r4, #+56] + +If we can spare the registers, it would be better to use fldm and fstm here. +Need major register allocator enhancement though. + +2) Can we recognize the relative position of constantpool entries? i.e. Treat + + ldr r0, LCPI17_3 + ldr r1, LCPI17_4 + ldr r2, LCPI17_5 + + as + ldr r0, LCPI17 + ldr r1, LCPI17+4 + ldr r2, LCPI17+8 + + Then the ldr's can be combined into a single ldm. See Olden/power. + +Note for ARM v4 gcc uses ldmia to load a pair of 32-bit values to represent a +double 64-bit FP constant: + + adr r0, L6 + ldmia r0, {r0-r1} + + .align 2 +L6: + .long -858993459 + .long 1074318540 + +3) struct copies appear to be done field by field +instead of by words, at least sometimes: + +struct foo { int x; short s; char c1; char c2; }; +void cpy(struct foo*a, struct foo*b) { *a = *b; } + +llvm code (-O2) + ldrb r3, [r1, #+6] + ldr r2, [r1] + ldrb r12, [r1, #+7] + ldrh r1, [r1, #+4] + str r2, [r0] + strh r1, [r0, #+4] + strb r3, [r0, #+6] + strb r12, [r0, #+7] +gcc code (-O2) + ldmia r1, {r1-r2} + stmia r0, {r1-r2} + +In this benchmark poor handling of aggregate copies has shown up as +having a large effect on size, and possibly speed as well (we don't have +a good way to measure on ARM). 
+ +//===---------------------------------------------------------------------===// + +* Consider this silly example: + +double bar(double x) { + double r = foo(3.1); + return x+r; +} + +_bar: + stmfd sp!, {r4, r5, r7, lr} + add r7, sp, #8 + mov r4, r0 + mov r5, r1 + fldd d0, LCPI1_0 + fmrrd r0, r1, d0 + bl _foo + fmdrr d0, r4, r5 + fmsr s2, r0 + fsitod d1, s2 + faddd d0, d1, d0 + fmrrd r0, r1, d0 + ldmfd sp!, {r4, r5, r7, pc} + +Ignore the prologue and epilogue stuff for a second. Note + mov r4, r0 + mov r5, r1 +the copys to callee-save registers and the fact they are only being used by the +fmdrr instruction. It would have been better had the fmdrr been scheduled +before the call and place the result in a callee-save DPR register. The two +mov ops would not have been necessary. + +//===---------------------------------------------------------------------===// + +Calling convention related stuff: + +* gcc's parameter passing implementation is terrible and we suffer as a result: + +e.g. +struct s { + double d1; + int s1; +}; + +void foo(struct s S) { + printf("%g, %d\n", S.d1, S.s1); +} + +'S' is passed via registers r0, r1, r2. But gcc stores them to the stack, and +then reload them to r1, r2, and r3 before issuing the call (r0 contains the +address of the format string): + + stmfd sp!, {r7, lr} + add r7, sp, #0 + sub sp, sp, #12 + stmia sp, {r0, r1, r2} + ldmia sp, {r1-r2} + ldr r0, L5 + ldr r3, [sp, #8] +L2: + add r0, pc, r0 + bl L_printf$stub + +Instead of a stmia, ldmia, and a ldr, wouldn't it be better to do three moves? + +* Return an aggregate type is even worse: + +e.g. +struct s foo(void) { + struct s S = {1.1, 2}; + return S; +} + + mov ip, r0 + ldr r0, L5 + sub sp, sp, #12 +L2: + add r0, pc, r0 + @ lr needed for prologue + ldmia r0, {r0, r1, r2} + stmia sp, {r0, r1, r2} + stmia ip, {r0, r1, r2} + mov r0, ip + add sp, sp, #12 + bx lr + +r0 (and later ip) is the hidden parameter from caller to store the value in. The +first ldmia loads the constants into r0, r1, r2. The last stmia stores r0, r1, +r2 into the address passed in. However, there is one additional stmia that +stores r0, r1, and r2 to some stack location. The store is dead. + +The llvm-gcc generated code looks like this: + +csretcc void %foo(%struct.s* %agg.result) { +entry: + %S = alloca %struct.s, align 4 ; <%struct.s*> [#uses=1] + %memtmp = alloca %struct.s ; <%struct.s*> [#uses=1] + cast %struct.s* %S to sbyte* ; <sbyte*>:0 [#uses=2] + call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 ) + cast %struct.s* %agg.result to sbyte* ; <sbyte*>:1 [#uses=2] + call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 ) + cast %struct.s* %memtmp to sbyte* ; <sbyte*>:2 [#uses=1] + call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 ) + ret void +} + +llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from +constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated +into a number of load and stores, or 2) custom lower memcpy (of small size) to +be ldmia / stmia. I think option 2 is better but the current register +allocator cannot allocate a chunk of registers at a time. + +A feasible temporary solution is to use specific physical registers at the +lowering time for small (<= 4 words?) transfer size. + +* ARM CSRet calling convention requires the hidden argument to be returned by +the callee. 
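+
+A minimal C-level sketch of that hidden argument (hypothetical, for
+illustration only): the struct return above behaves roughly as if the function
+took an explicit out-pointer, and CSRet additionally requires the callee to
+hand that pointer back in r0:
+
+struct s2 { double d1; int s1; };
+
+struct s2 make(void) {       // caller passes a hidden &result in r0
+  struct s2 S = {1.1, 2};
+  return S;                  // callee stores S through the hidden pointer and
+}                            // returns that same pointer in r0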
+ +//===---------------------------------------------------------------------===// + +We can definitely do a better job on BB placements to eliminate some branches. +It's very common to see llvm generated assembly code that looks like this: + +LBB3: + ... +LBB4: +... + beq LBB3 + b LBB2 + +If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can +then eliminate beq and and turn the unconditional branch to LBB2 to a bne. + +See McCat/18-imp/ComputeBoundingBoxes for an example. + +//===---------------------------------------------------------------------===// + +Pre-/post- indexed load / stores: + +1) We should not make the pre/post- indexed load/store transform if the base ptr +is guaranteed to be live beyond the load/store. This can happen if the base +ptr is live out of the block we are performing the optimization. e.g. + +mov r1, r2 +ldr r3, [r1], #4 +... + +vs. + +ldr r3, [r2] +add r1, r2, #4 +... + +In most cases, this is just a wasted optimization. However, sometimes it can +negatively impact the performance because two-address code is more restrictive +when it comes to scheduling. + +Unfortunately, liveout information is currently unavailable during DAG combine +time. + +2) Consider spliting a indexed load / store into a pair of add/sub + load/store + to solve #1 (in TwoAddressInstructionPass.cpp). + +3) Enhance LSR to generate more opportunities for indexed ops. + +4) Once we added support for multiple result patterns, write indexed loads + patterns instead of C++ instruction selection code. + +5) Use VLDM / VSTM to emulate indexed FP load / store. + +//===---------------------------------------------------------------------===// + +Implement support for some more tricky ways to materialize immediates. For +example, to get 0xffff8000, we can use: + +mov r9, #&3f8000 +sub r9, r9, #&400000 + +//===---------------------------------------------------------------------===// + +We sometimes generate multiple add / sub instructions to update sp in prologue +and epilogue if the inc / dec value is too large to fit in a single immediate +operand. In some cases, perhaps it might be better to load the value from a +constantpool instead. + +//===---------------------------------------------------------------------===// + +GCC generates significantly better code for this function. + +int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) { + int i = 0; + + if (StackPtr != 0) { + while (StackPtr != 0 && i < (((LineLen) < (32768))? (LineLen) : (32768))) + Line[i++] = Stack[--StackPtr]; + if (LineLen > 32768) + { + while (StackPtr != 0 && i < LineLen) + { + i++; + --StackPtr; + } + } + } + return StackPtr; +} + +//===---------------------------------------------------------------------===// + +This should compile to the mlas instruction: +int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 
7 : 13; } + +//===---------------------------------------------------------------------===// + +At some point, we should triage these to see if they still apply to us: + +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016 + +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982 + +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663 + +http://www.inf.u-szeged.hu/gcc-arm/ +http://citeseer.ist.psu.edu/debus04linktime.html + +//===---------------------------------------------------------------------===// + +gcc generates smaller code for this function at -O2 or -Os: + +void foo(signed char* p) { + if (*p == 3) + bar(); + else if (*p == 4) + baz(); + else if (*p == 5) + quux(); +} + +llvm decides it's a good idea to turn the repeated if...else into a +binary tree, as if it were a switch; the resulting code requires -1 +compare-and-branches when *p<=2 or *p==5, the same number if *p==4 +or *p>6, and +1 if *p==3. So it should be a speed win +(on balance). However, the revised code is larger, with 4 conditional +branches instead of 3. + +More seriously, there is a byte->word extend before +each comparison, where there should be only one, and the condition codes +are not remembered when the same two values are compared twice. + +//===---------------------------------------------------------------------===// + +More LSR enhancements possible: + +1. Teach LSR about pre- and post- indexed ops to allow iv increment be merged + in a load / store. +2. Allow iv reuse even when a type conversion is required. For example, i8 + and i32 load / store addressing modes are identical. + + +//===---------------------------------------------------------------------===// + +This: + +int foo(int a, int b, int c, int d) { + long long acc = (long long)a * (long long)b; + acc += (long long)c * (long long)d; + return (int)(acc >> 32); +} + +Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies +two signed 32-bit values to produce a 64-bit value, and accumulates this with +a 64-bit value. 
+ +We currently get this with both v4 and v6: + +_foo: + smull r1, r0, r1, r0 + smull r3, r2, r3, r2 + adds r3, r3, r1 + adc r0, r2, r0 + bx lr + +//===---------------------------------------------------------------------===// + +This: + #include <algorithm> + std::pair<unsigned, bool> full_add(unsigned a, unsigned b) + { return std::make_pair(a + b, a + b < a); } + bool no_overflow(unsigned a, unsigned b) + { return !full_add(a, b).second; } + +Should compile to: + +_Z8full_addjj: + adds r2, r1, r2 + movcc r1, #0 + movcs r1, #1 + str r2, [r0, #0] + strb r1, [r0, #4] + mov pc, lr + +_Z11no_overflowjj: + cmn r0, r1 + movcs r0, #0 + movcc r0, #1 + mov pc, lr + +not: + +__Z8full_addjj: + add r3, r2, r1 + str r3, [r0] + mov r2, #1 + mov r12, #0 + cmp r3, r1 + movlo r12, r2 + str r12, [r0, #+4] + bx lr +__Z11no_overflowjj: + add r3, r1, r0 + mov r2, #1 + mov r1, #0 + cmp r3, r0 + movhs r1, r2 + mov r0, r1 + bx lr + +//===---------------------------------------------------------------------===// + +Some of the NEON intrinsics may be appropriate for more general use, either +as target-independent intrinsics or perhaps elsewhere in the ARM backend. +Some of them may also be lowered to target-independent SDNodes, and perhaps +some new SDNodes could be added. + +For example, maximum, minimum, and absolute value operations are well-defined +and standard operations, both for vector and scalar types. + +The current NEON-specific intrinsics for count leading zeros and count one +bits could perhaps be replaced by the target-independent ctlz and ctpop +intrinsics. It may also make sense to add a target-independent "ctls" +intrinsic for "count leading sign bits". Likewise, the backend could use +the target-independent SDNodes for these operations. + +ARMv6 has scalar saturating and halving adds and subtracts. The same +intrinsics could possibly be used for both NEON's vector implementations of +those operations and the ARMv6 scalar versions. + +//===---------------------------------------------------------------------===// + +ARM::MOVCCr is commutable (by flipping the condition). But we need to implement +ARMInstrInfo::commuteInstruction() to support it. + +//===---------------------------------------------------------------------===// + +Split out LDR (literal) from normal ARM LDR instruction. Also consider spliting +LDR into imm12 and so_reg forms. This allows us to clean up some code. e.g. +ARMLoadStoreOptimizer does not need to look at LDR (literal) and LDR (so_reg) +while ARMConstantIslandPass only need to worry about LDR (literal). + +//===---------------------------------------------------------------------===// + +Constant island pass should make use of full range SoImm values for LEApcrel. +Be careful though as the last attempt caused infinite looping on lencod. + +//===---------------------------------------------------------------------===// + +Predication issue. 
This function: + +extern unsigned array[ 128 ]; +int foo( int x ) { + int y; + y = array[ x & 127 ]; + if ( x & 128 ) + y = 123456789 & ( y >> 2 ); + else + y = 123456789 & y; + return y; +} + +compiles to: + +_foo: + and r1, r0, #127 + ldr r2, LCPI1_0 + ldr r2, [r2] + ldr r1, [r2, +r1, lsl #2] + mov r2, r1, lsr #2 + tst r0, #128 + moveq r2, r1 + ldr r0, LCPI1_1 + and r0, r2, r0 + bx lr + +It would be better to do something like this, to fold the shift into the +conditional move: + + and r1, r0, #127 + ldr r2, LCPI1_0 + ldr r2, [r2] + ldr r1, [r2, +r1, lsl #2] + tst r0, #128 + movne r1, r1, lsr #2 + ldr r0, LCPI1_1 + and r0, r1, r0 + bx lr + +it saves an instruction and a register. + +//===---------------------------------------------------------------------===// + +It might be profitable to cse MOVi16 if there are lots of 32-bit immediates +with the same bottom half. + +//===---------------------------------------------------------------------===// + +Robert Muth started working on an alternate jump table implementation that +does not put the tables in-line in the text. This is more like the llvm +default jump table implementation. This might be useful sometime. Several +revisions of patches are on the mailing list, beginning at: +http://lists.cs.uiuc.edu/pipermail/llvmdev/2009-June/022763.html + +//===---------------------------------------------------------------------===// + +Make use of the "rbit" instruction. + +//===---------------------------------------------------------------------===// + +Take a look at test/CodeGen/Thumb2/machine-licm.ll. ARM should be taught how +to licm and cse the unnecessary load from cp#1. + +//===---------------------------------------------------------------------===// + +The CMN instruction sets the flags like an ADD instruction, while CMP sets +them like a subtract. Therefore to be able to use CMN for comparisons other +than the Z bit, we'll need additional logic to reverse the conditionals +associated with the comparison. Perhaps a pseudo-instruction for the comparison, +with a post-codegen pass to clean up and handle the condition codes? +See PR5694 for testcase. diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp new file mode 100644 index 0000000..163a0a9 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp @@ -0,0 +1,23 @@ +//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "llvm/Module.h" +#include "llvm/Target/TargetRegistry.h" +using namespace llvm; + +Target llvm::TheARMTarget, llvm::TheThumbTarget; + +extern "C" void LLVMInitializeARMTargetInfo() { + RegisterTarget<Triple::arm, /*HasJIT=*/true> + X(TheARMTarget, "arm", "ARM"); + + RegisterTarget<Triple::thumb, /*HasJIT=*/true> + Y(TheThumbTarget, "thumb", "Thumb"); +} diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt new file mode 100644 index 0000000..3910bb0 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) + +add_llvm_library(LLVMARMInfo + ARMTargetInfo.cpp + ) + +add_dependencies(LLVMARMInfo ARMCodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/Makefile b/contrib/llvm/lib/Target/ARM/TargetInfo/Makefile new file mode 100644 index 0000000..6292ab1 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/ARM/TargetInfo/Makefile ------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMARMInfo + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp new file mode 100644 index 0000000..fae84d4 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -0,0 +1,266 @@ +//===- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-1 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "Thumb1InstrInfo.h" +#include "ARM.h" +#include "ARMGenInstrInfo.inc" +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/ADT/SmallVector.h" +#include "Thumb1InstrInfo.h" + +using namespace llvm; + +Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI), RI(*this, STI) { +} + +unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const { + return 0; +} + +bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC, + DebugLoc DL) const { + if (DestRC == ARM::GPRRegisterClass) { + if (SrcRC == ARM::GPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg); + return true; + } else if (SrcRC == ARM::tGPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg); + return true; + } + } else if (DestRC == ARM::tGPRRegisterClass) { + if (SrcRC == ARM::GPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg); + return true; + } else if (SrcRC == ARM::tGPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg); + return true; + } + } + + return false; +} + +bool Thumb1InstrInfo:: +canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const { + if (Ops.size() != 1) return false; + + unsigned OpNum = Ops[0]; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: break; + case ARM::tMOVr: + case ARM::tMOVtgpr2gpr: + case ARM::tMOVgpr2tgpr: + case ARM::tMOVgpr2gpr: { + if (OpNum == 0) { // move -> store + unsigned SrcReg = 
MI->getOperand(1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + !isARMLowRegister(SrcReg)) + // tSpill cannot take a high register operand. + return false; + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg) && + !isARMLowRegister(DstReg)) + // tRestore cannot target a high register operand. + return false; + } + return true; + } + } + + return false; +} + +void Thumb1InstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert((RC == ARM::tGPRRegisterClass || + (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + isARMLowRegister(SrcReg))) && "Unknown regclass!"); + + if (RC == ARM::tGPRRegisterClass || + (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + isARMLowRegister(SrcReg))) { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), + MachineMemOperand::MOStore, 0, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSpill)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } +} + +void Thumb1InstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert((RC == ARM::tGPRRegisterClass || + (TargetRegisterInfo::isPhysicalRegister(DestReg) && + isARMLowRegister(DestReg))) && "Unknown regclass!"); + + if (RC == ARM::tGPRRegisterClass || + (TargetRegisterInfo::isPhysicalRegister(DestReg) && + isARMLowRegister(DestReg))) { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), + MachineMemOperand::MOLoad, 0, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } +} + +bool Thumb1InstrInfo:: +spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL; + if (MI != MBB.end()) DL = MI->getDebugLoc(); + + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH)); + AddDefaultPred(MIB); + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + bool isKill = true; + + // Add the callee-saved register as live-in unless it's LR and + // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress + // then it's already added to the function and entry block live-in sets. 
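+    // Illustrative example (not part of the original code): for a function
+    // that spills {r4, r5, r7, lr}, this loop appends each register to the
+    // single predicated tPUSH built above, roughly
+    //   tPUSH pred:14, pred:%reg0, %R4<kill>, %R5<kill>, %R7<kill>, %LR<kill>
+    // and marks each one live-in to the block unless LR must stay live for
+    // @llvm.returnaddress.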
+ if (Reg == ARM::LR) { + MachineFunction &MF = *MBB.getParent(); + if (MF.getFrameInfo()->isReturnAddressTaken() && + MF.getRegInfo().isLiveIn(Reg)) + isKill = false; + } + + if (isKill) { + MBB.addLiveIn(Reg); + MIB.addReg(Reg, RegState::Kill); + } + } + return true; +} + +bool Thumb1InstrInfo:: +restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + if (CSI.empty()) + return false; + + bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; + DebugLoc DL = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::tPOP)); + AddDefaultPred(MIB); + + bool NumRegs = false; + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (Reg == ARM::LR) { + // Special epilogue for vararg functions. See emitEpilogue + if (isVarArg) + continue; + Reg = ARM::PC; + (*MIB).setDesc(get(ARM::tPOP_RET)); + MI = MBB.erase(MI); + } + MIB.addReg(Reg, getDefRegState(true)); + NumRegs = true; + } + + // It's illegal to emit pop instruction without operands. + if (NumRegs) + MBB.insert(MI, &*MIB); + else + MF.DeleteMachineInstr(MIB); + + return true; +} + +MachineInstr *Thumb1InstrInfo:: +foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, int FI) const { + if (Ops.size() != 1) return NULL; + + unsigned OpNum = Ops[0]; + unsigned Opc = MI->getOpcode(); + MachineInstr *NewMI = NULL; + switch (Opc) { + default: break; + case ARM::tMOVr: + case ARM::tMOVtgpr2gpr: + case ARM::tMOVgpr2tgpr: + case ARM::tMOVgpr2gpr: { + if (OpNum == 0) { // move -> store + unsigned SrcReg = MI->getOperand(1).getReg(); + bool isKill = MI->getOperand(1).isKill(); + if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + !isARMLowRegister(SrcReg)) + // tSpill cannot take a high register operand. + break; + NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0)); + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg) && + !isARMLowRegister(DstReg)) + // tRestore cannot target a high register operand. + break; + bool isDead = MI->getOperand(0).isDead(); + NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore)) + .addReg(DstReg, + RegState::Define | getDeadRegState(isDead)) + .addFrameIndex(FI).addImm(0)); + } + break; + } + } + + return NewMI; +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h new file mode 100644 index 0000000..c937296 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -0,0 +1,84 @@ +//===- Thumb1InstrInfo.h - Thumb-1 Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-1 implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef THUMB1INSTRUCTIONINFO_H +#define THUMB1INSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "ARM.h" +#include "ARMInstrInfo.h" +#include "Thumb1RegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + +class Thumb1InstrInfo : public ARMBaseInstrInfo { + Thumb1RegisterInfo RI; +public: + explicit Thumb1InstrInfo(const ARMSubtarget &STI); + + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. + unsigned getUnindexedOpcode(unsigned Opc) const; + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const Thumb1RegisterInfo &getRegisterInfo() const { return RI; } + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC, + DebugLoc DL) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + bool canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const; + + MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const; + + MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr* LoadMI) const { + return 0; + } +}; +} + +#endif // THUMB1INSTRUCTIONINFO_H diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp new file mode 100644 index 0000000..2f635fe --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -0,0 +1,885 @@ +//===- Thumb1RegisterInfo.cpp - Thumb-1 Register Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-1 implementation of the TargetRegisterInfo +// class. 
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMBaseInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "Thumb1InstrInfo.h" +#include "Thumb1RegisterInfo.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/LLVMContext.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +extern cl::opt<bool> ReuseFrameIndexVals; +} + +using namespace llvm; + +Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &sti) + : ARMBaseRegisterInfo(tii, sti) { +} + +/// emitLoadConstPool - Emits a load from constpool to materialize the +/// specified immediate. +void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred, + unsigned PredReg) const { + MachineFunction &MF = *MBB.getParent(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + const Constant *C = ConstantInt::get( + Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRcp)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg); +} + +const TargetRegisterClass* +Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, EVT VT) const { + if (isARMLowRegister(Reg)) + return ARM::tGPRRegisterClass; + switch (Reg) { + default: + break; + case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11: + case ARM::R12: case ARM::SP: case ARM::LR: case ARM::PC: + return ARM::GPRRegisterClass; + } + + return TargetRegisterInfo::getPhysicalRegisterRegClass(Reg, VT); +} + +bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + unsigned CFSize = FFI->getMaxCallFrameSize(); + // It's not always a good idea to include the call frame as part of the + // stack frame. ARM (especially Thumb) has small immediate offset to + // address the stack frame. So a large call frame can cause poor codegen + // and may even makes it impossible to scavenge a register. + if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4 + return false; + + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + + +/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize +/// a destreg = basereg + immediate in Thumb code. Materialize the immediate +/// in a register using mov / mvn sequences or load the immediate from a +/// constpool entry. 
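+/// For illustration only (not part of the original code): a request such as
+/// "r0 = sp + 1000" cannot be encoded directly, since tMOVi8 only covers
+/// 0..255 and SP is a high register, so the helper below emits roughly
+///   ldr r0, .LCPIn      @ constant-pool entry holding 1000
+///   add r0, r0, sp      @ tADDhirr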
+static +void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned DestReg, unsigned BaseReg, + int NumBytes, bool CanChangeCC, + const TargetInstrInfo &TII, + const Thumb1RegisterInfo& MRI, + DebugLoc dl) { + MachineFunction &MF = *MBB.getParent(); + bool isHigh = !isARMLowRegister(DestReg) || + (BaseReg != 0 && !isARMLowRegister(BaseReg)); + bool isSub = false; + // Subtract doesn't have high register version. Load the negative value + // if either base or dest register is a high register. Also, if do not + // issue sub as part of the sequence if condition register is to be + // preserved. + if (NumBytes < 0 && !isHigh && CanChangeCC) { + isSub = true; + NumBytes = -NumBytes; + } + unsigned LdReg = DestReg; + if (DestReg == ARM::SP) { + assert(BaseReg == ARM::SP && "Unexpected!"); + LdReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); + } + + if (NumBytes <= 255 && NumBytes >= 0) + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) + .addImm(NumBytes); + else if (NumBytes < 0 && NumBytes >= -255) { + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) + .addImm(NumBytes); + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg)) + .addReg(LdReg, RegState::Kill); + } else + MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes); + + // Emit add / sub. + int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); + if (Opc != ARM::tADDhirr) + MIB = AddDefaultT1CC(MIB); + if (DestReg == ARM::SP || isSub) + MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill); + else + MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill); + AddDefaultPred(MIB); +} + +/// calcNumMI - Returns the number of instructions required to materialize +/// the specific add / sub r, c instruction. +static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes, + unsigned NumBits, unsigned Scale) { + unsigned NumMIs = 0; + unsigned Chunk = ((1 << NumBits) - 1) * Scale; + + if (Opc == ARM::tADDrSPi) { + unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; + Bytes -= ThisVal; + NumMIs++; + NumBits = 8; + Scale = 1; // Followed by a number of tADDi8. + Chunk = ((1 << NumBits) - 1) * Scale; + } + + NumMIs += Bytes / Chunk; + if ((Bytes % Chunk) != 0) + NumMIs++; + if (ExtraOpc) + NumMIs++; + return NumMIs; +} + +/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize +/// a destreg = basereg + immediate in Thumb code. +static +void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned DestReg, unsigned BaseReg, + int NumBytes, const TargetInstrInfo &TII, + const Thumb1RegisterInfo& MRI, + DebugLoc dl) { + bool isSub = NumBytes < 0; + unsigned Bytes = (unsigned)NumBytes; + if (isSub) Bytes = -NumBytes; + bool isMul4 = (Bytes & 3) == 0; + bool isTwoAddr = false; + bool DstNotEqBase = false; + unsigned NumBits = 1; + unsigned Scale = 1; + int Opc = 0; + int ExtraOpc = 0; + bool NeedCC = false; + bool NeedPred = false; + + if (DestReg == BaseReg && BaseReg == ARM::SP) { + assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!"); + NumBits = 7; + Scale = 4; + Opc = isSub ? 
ARM::tSUBspi : ARM::tADDspi; + isTwoAddr = true; + } else if (!isSub && BaseReg == ARM::SP) { + // r1 = add sp, 403 + // => + // r1 = add sp, 100 * 4 + // r1 = add r1, 3 + if (!isMul4) { + Bytes &= ~3; + ExtraOpc = ARM::tADDi3; + } + NumBits = 8; + Scale = 4; + Opc = ARM::tADDrSPi; + } else { + // sp = sub sp, c + // r1 = sub sp, c + // r8 = sub sp, c + if (DestReg != BaseReg) + DstNotEqBase = true; + NumBits = 8; + if (DestReg == ARM::SP) { + Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; + assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!"); + NumBits = 7; + Scale = 4; + } else { + Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8; + NumBits = 8; + NeedPred = NeedCC = true; + } + isTwoAddr = true; + } + + unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale); + unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2; + if (NumMIs > Threshold) { + // This will expand into too many instructions. Load the immediate from a + // constpool entry. + emitThumbRegPlusImmInReg(MBB, MBBI, DestReg, BaseReg, NumBytes, true, TII, + MRI, dl); + return; + } + + if (DstNotEqBase) { + if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) { + // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7) + unsigned Chunk = (1 << 3) - 1; + unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; + Bytes -= ThisVal; + const TargetInstrDesc &TID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3); + const MachineInstrBuilder MIB = + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)); + AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal)); + } else { + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg) + .addReg(BaseReg, RegState::Kill); + } + BaseReg = DestReg; + } + + unsigned Chunk = ((1 << NumBits) - 1) * Scale; + while (Bytes) { + unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; + Bytes -= ThisVal; + ThisVal /= Scale; + // Build the new tADD / tSUB. + if (isTwoAddr) { + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); + if (NeedCC) + MIB = AddDefaultT1CC(MIB); + MIB .addReg(DestReg).addImm(ThisVal); + if (NeedPred) + MIB = AddDefaultPred(MIB); + } + else { + bool isKill = BaseReg != ARM::SP; + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); + if (NeedCC) + MIB = AddDefaultT1CC(MIB); + MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal); + if (NeedPred) + MIB = AddDefaultPred(MIB); + BaseReg = DestReg; + + if (Opc == ARM::tADDrSPi) { + // r4 = add sp, imm + // r4 = add r4, imm + // ... + NumBits = 8; + Scale = 1; + Chunk = ((1 << NumBits) - 1) * Scale; + Opc = isSub ? 
ARM::tSUBi8 : ARM::tADDi8; + NeedPred = NeedCC = isTwoAddr = true; + } + } + } + + if (ExtraOpc) { + const TargetInstrDesc &TID = TII.get(ExtraOpc); + AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)) + .addReg(DestReg, RegState::Kill) + .addImm(((unsigned)NumBytes) & 3)); + } +} + +static void emitSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetInstrInfo &TII, DebugLoc dl, + const Thumb1RegisterInfo &MRI, + int NumBytes) { + emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII, + MRI, dl); +} + +void Thumb1RegisterInfo:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + if (!hasReservedCallFrame(MF)) { + // If we have alloca, convert as follows: + // ADJCALLSTACKDOWN -> sub, sp, sp, amount + // ADJCALLSTACKUP -> add, sp, sp, amount + MachineInstr *Old = I; + DebugLoc dl = Old->getDebugLoc(); + unsigned Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + // Replace the pseudo instruction with a new instruction... + unsigned Opc = Old->getOpcode(); + if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { + emitSPUpdate(MBB, I, TII, dl, *this, -Amount); + } else { + assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); + emitSPUpdate(MBB, I, TII, dl, *this, Amount); + } + } + } + MBB.erase(I); +} + +/// emitThumbConstant - Emit a series of instructions to materialize a +/// constant. +static void emitThumbConstant(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned DestReg, int Imm, + const TargetInstrInfo &TII, + const Thumb1RegisterInfo& MRI, + DebugLoc dl) { + bool isSub = Imm < 0; + if (isSub) Imm = -Imm; + + int Chunk = (1 << 8) - 1; + int ThisVal = (Imm > Chunk) ? Chunk : Imm; + Imm -= ThisVal; + AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), + DestReg)) + .addImm(ThisVal)); + if (Imm > 0) + emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl); + if (isSub) { + const TargetInstrDesc &TID = TII.get(ARM::tRSB); + AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)) + .addReg(DestReg, RegState::Kill)); + } +} + +static void removeOperands(MachineInstr &MI, unsigned i) { + unsigned Op = i; + for (unsigned e = MI.getNumOperands(); i != e; ++i) + MI.RemoveOperand(Op); +} + +int Thumb1RegisterInfo:: +rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int Offset, + unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const +{ + // if/when eliminateFrameIndex() conforms with ARMBaseRegisterInfo + // version then can pull out Thumb1 specific parts here + return 0; +} + +/// saveScavengerRegister - Spill the register so it can be used by the +/// register scavenger. Return true. +bool +Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator &UseMI, + const TargetRegisterClass *RC, + unsigned Reg) const { + // Thumb1 can't use the emergency spill slot on the stack because + // ldr/str immediate offsets must be positive, and if we're referencing + // off the frame pointer (if, for example, there are alloca() calls in + // the function, the offset will be negative. 
Use R12 instead since that's + // a call clobbered register that we know won't be used in Thumb1 mode. + DebugLoc DL; + BuildMI(MBB, I, DL, TII.get(ARM::tMOVtgpr2gpr)). + addReg(ARM::R12, RegState::Define).addReg(Reg, RegState::Kill); + + // The UseMI is where we would like to restore the register. If there's + // interference with R12 before then, however, we'll need to restore it + // before that instead and adjust the UseMI. + bool done = false; + for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) { + // If this instruction affects R12, adjust our restore point. + for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = II->getOperand(i); + if (!MO.isReg() || MO.isUndef() || !MO.getReg() || + TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + if (MO.getReg() == ARM::R12) { + UseMI = II; + done = true; + break; + } + } + } + // Restore the register from R12 + BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVgpr2tgpr)). + addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill); + + return true; +} + +unsigned +Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, FrameIndexValue *Value, + RegScavenger *RS) const{ + unsigned VReg = 0; + unsigned i = 0; + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + DebugLoc dl = MI.getDebugLoc(); + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + + unsigned FrameReg = ARM::SP; + int FrameIndex = MI.getOperand(i).getIndex(); + int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + + MF.getFrameInfo()->getStackSize() + SPAdj; + + if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex)) + Offset -= AFI->getGPRCalleeSavedArea1Offset(); + else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex)) + Offset -= AFI->getGPRCalleeSavedArea2Offset(); + else if (MF.getFrameInfo()->hasVarSizedObjects()) { + assert(SPAdj == 0 && hasFP(MF) && "Unexpected"); + // There are alloca()'s in this function, must reference off the frame + // pointer instead. + FrameReg = getFrameRegister(MF); + Offset -= AFI->getFramePtrSpillOffset(); + } + + // Special handling of dbg_value instructions. + if (MI.isDebugValue()) { + MI.getOperand(i). ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(i+1).ChangeToImmediate(Offset); + return 0; + } + + unsigned Opcode = MI.getOpcode(); + const TargetInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + + if (Opcode == ARM::tADDrSPi) { + Offset += MI.getOperand(i+1).getImm(); + + // Can't use tADDrSPi if it's based off the frame pointer. + unsigned NumBits = 0; + unsigned Scale = 1; + if (FrameReg != ARM::SP) { + Opcode = ARM::tADDi3; + MI.setDesc(TII.get(Opcode)); + NumBits = 3; + } else { + NumBits = 8; + Scale = 4; + assert((Offset & 3) == 0 && + "Thumb add/sub sp, #imm immediate must be multiple of 4!"); + } + + unsigned PredReg; + if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) { + // Turn it into a move. + MI.setDesc(TII.get(ARM::tMOVgpr2tgpr)); + MI.getOperand(i).ChangeToRegister(FrameReg, false); + // Remove offset and remaining explicit predicate operands. + do MI.RemoveOperand(i+1); + while (MI.getNumOperands() > i+1 && + (!MI.getOperand(i+1).isReg() || !MI.getOperand(i+1).isImm())); + return 0; + } + + // Common case: small offset, fits into instruction. 
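+    // Illustration (not part of the original code): with FrameReg == SP this
+    // path uses NumBits == 8 and Scale == 4, so any word-aligned offset up to
+    // 255 * 4 == 1020 bytes is folded directly into the tADDrSPi below;
+    // anything larger falls through to the multi-instruction expansion.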
+ unsigned Mask = (1 << NumBits) - 1; + if (((Offset / Scale) & ~Mask) == 0) { + // Replace the FrameIndex with sp / fp + if (Opcode == ARM::tADDi3) { + removeOperands(MI, i); + MachineInstrBuilder MIB(&MI); + AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg) + .addImm(Offset / Scale)); + } else { + MI.getOperand(i).ChangeToRegister(FrameReg, false); + MI.getOperand(i+1).ChangeToImmediate(Offset / Scale); + } + return 0; + } + + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned Bytes = (Offset > 0) ? Offset : -Offset; + unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale); + // MI would expand into a large number of instructions. Don't try to + // simplify the immediate. + if (NumMIs > 2) { + emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII, + *this, dl); + MBB.erase(II); + return 0; + } + + if (Offset > 0) { + // Translate r0 = add sp, imm to + // r0 = add sp, 255*4 + // r0 = add r0, (imm - 255*4) + if (Opcode == ARM::tADDi3) { + removeOperands(MI, i); + MachineInstrBuilder MIB(&MI); + AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask)); + } else { + MI.getOperand(i).ChangeToRegister(FrameReg, false); + MI.getOperand(i+1).ChangeToImmediate(Mask); + } + Offset = (Offset - Mask * Scale); + MachineBasicBlock::iterator NII = llvm::next(II); + emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII, + *this, dl); + } else { + // Translate r0 = add sp, -imm to + // r0 = -imm (this is then translated into a series of instructons) + // r0 = add r0, sp + emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl); + + MI.setDesc(TII.get(ARM::tADDhirr)); + MI.getOperand(i).ChangeToRegister(DestReg, false, false, true); + MI.getOperand(i+1).ChangeToRegister(FrameReg, false); + if (Opcode == ARM::tADDi3) { + MachineInstrBuilder MIB(&MI); + AddDefaultPred(MIB); + } + } + return 0; + } else { + unsigned ImmIdx = 0; + int InstrOffs = 0; + unsigned NumBits = 0; + unsigned Scale = 1; + switch (AddrMode) { + case ARMII::AddrModeT1_s: { + ImmIdx = i+1; + InstrOffs = MI.getOperand(ImmIdx).getImm(); + NumBits = (FrameReg == ARM::SP) ? 8 : 5; + Scale = 4; + break; + } + default: + llvm_unreachable("Unsupported addressing mode!"); + break; + } + + Offset += InstrOffs * Scale; + assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!"); + + // Common case: small offset, fits into instruction. + MachineOperand &ImmOp = MI.getOperand(ImmIdx); + int ImmedOffset = Offset / Scale; + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) { + // Replace the FrameIndex with sp + MI.getOperand(i).ChangeToRegister(FrameReg, false); + ImmOp.ChangeToImmediate(ImmedOffset); + return 0; + } + + bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill; + if (AddrMode == ARMII::AddrModeT1_s) { + // Thumb tLDRspi, tSTRspi. These will change to instructions that use + // a different base register. + NumBits = 5; + Mask = (1 << NumBits) - 1; + } + // If this is a thumb spill / restore, we will be using a constpool load to + // materialize the offset. + if (AddrMode == ARMII::AddrModeT1_s && isThumSpillRestore) + ImmOp.ChangeToImmediate(0); + else { + // Otherwise, it didn't fit. Pull in what we can to simplify the immed. + ImmedOffset = ImmedOffset & Mask; + ImmOp.ChangeToImmediate(ImmedOffset); + Offset &= ~(Mask*Scale); + } + } + + // If we get here, the immediate doesn't fit into the instruction. We folded + // as much as possible above, handle the rest, providing a register that is + // SP+LargeImm. 
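+  // Sketch of the rewrite performed below (illustrative, not from the
+  // original code): a tRestore whose slot sits, say, 2000 bytes above SP
+  // cannot encode that offset, so SP + 2000 is first materialized into the
+  // destination register and the access is rewritten to a plain tLDR
+  // through that register.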
+ assert(Offset && "This code isn't needed if offset already handled!"); + + // Remove predicate first. + int PIdx = MI.findFirstPredOperandIdx(); + if (PIdx != -1) + removeOperands(MI, PIdx); + + if (Desc.mayLoad()) { + // Use the destination register to materialize sp + offset. + unsigned TmpReg = MI.getOperand(0).getReg(); + bool UseRR = false; + if (Opcode == ARM::tRestore) { + if (FrameReg == ARM::SP) + emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg, + Offset, false, TII, *this, dl); + else { + emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset); + UseRR = true; + } + } else { + emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII, + *this, dl); + } + + MI.setDesc(TII.get(ARM::tLDR)); + MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true); + if (UseRR) + // Use [reg, reg] addrmode. + MI.addOperand(MachineOperand::CreateReg(FrameReg, false)); + else // tLDR has an extra register operand. + MI.addOperand(MachineOperand::CreateReg(0, false)); + } else if (Desc.mayStore()) { + VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); + assert (Value && "Frame index virtual allocated, but Value arg is NULL!"); + bool UseRR = false; + bool TrackVReg = true; + Value->first = FrameReg; // use the frame register as a kind indicator + Value->second = Offset; + + if (Opcode == ARM::tSpill) { + if (FrameReg == ARM::SP) + emitThumbRegPlusImmInReg(MBB, II, VReg, FrameReg, + Offset, false, TII, *this, dl); + else { + emitLoadConstPool(MBB, II, dl, VReg, 0, Offset); + UseRR = true; + TrackVReg = false; + } + } else + emitThumbRegPlusImmediate(MBB, II, VReg, FrameReg, Offset, TII, + *this, dl); + MI.setDesc(TII.get(ARM::tSTR)); + MI.getOperand(i).ChangeToRegister(VReg, false, false, true); + if (UseRR) // Use [reg, reg] addrmode. + MI.addOperand(MachineOperand::CreateReg(FrameReg, false)); + else // tSTR has an extra register operand. + MI.addOperand(MachineOperand::CreateReg(0, false)); + if (!ReuseFrameIndexVals || !TrackVReg) + VReg = 0; + } else + assert(false && "Unexpected opcode!"); + + // Add predicate back if it's needed. + if (MI.getDesc().isPredicable()) { + MachineInstrBuilder MIB(&MI); + AddDefaultPred(MIB); + } + return VReg; +} + +void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned NumBytes = MFI->getStackSize(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4. + NumBytes = (NumBytes + 3) & ~3; + MFI->setStackSize(NumBytes); + + // Determine the sizes of each callee-save spill areas and record which frame + // belongs to which callee-save spill areas. 
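+  // Worked example (illustrative, not from the original code): on Darwin a
+  // function spilling {r4, r7, r8, lr} ends up with GPRCS1Size == 12
+  // (r4, r7, lr), GPRCS2Size == 4 (r8) and DPRCSSize == 0 after the loop
+  // below.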
+ unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; + int FramePtrSpillFI = 0; + + if (VARegSaveSize) + emitSPUpdate(MBB, MBBI, TII, dl, *this, -VARegSaveSize); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes); + return; + } + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + int FI = CSI[i].getFrameIdx(); + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + break; + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + if (STI.isTargetDarwin()) { + AFI->addGPRCalleeSavedArea2Frame(FI); + GPRCS2Size += 4; + } else { + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + } + break; + default: + AFI->addDPRCalleeSavedAreaFrame(FI); + DPRCSSize += 8; + } + } + + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) { + ++MBBI; + if (MBBI != MBB.end()) + dl = MBBI->getDebugLoc(); + } + + // Darwin ABI requires FP to point to the stack slot that contains the + // previous FP. + if (STI.isTargetDarwin() || hasFP(MF)) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) + .addFrameIndex(FramePtrSpillFI).addImm(0); + } + + // Determine starting offsets of spill areas. + unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); + unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; + unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; + AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); + AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); + AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); + AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); + + NumBytes = DPRCSOffset; + if (NumBytes) { + // Insert it after all the callee-save spills. + emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes); + } + + if (STI.isTargetELF() && hasFP(MF)) { + MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - + AFI->getFramePtrSpillOffset()); + } + + AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); + AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); + AFI->setDPRCalleeSavedAreaSize(DPRCSSize); +} + +static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) { + if (MI->getOpcode() == ARM::tRestore && + MI->getOperand(1).isFI() && + isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs)) + return true; + else if (MI->getOpcode() == ARM::tPOP) { + // The first two operands are predicates. The last two are + // imp-def and imp-use of SP. Check everything in between. 
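+    // Illustrative operand layout for the check below (hypothetical dump):
+    //   tPOP pred:14, pred:%reg0, %R4<def>, %R7<def>, %SP<imp-def>, %SP<imp-use>
+    // Only the explicitly popped registers (r4 and r7 here) are inspected.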
+ for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i) + if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs)) + return false; + return true; + } + return false; +} + +void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + assert((MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::tPOP_RET) && + "Can only insert epilog into returning blocks"); + DebugLoc dl = MBBI->getDebugLoc(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + int NumBytes = (int)MFI->getStackSize(); + const unsigned *CSRegs = getCalleeSavedRegs(); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes); + } else { + // Unwind MBBI to point to first LDR / VLDRD. + if (MBBI != MBB.begin()) { + do + --MBBI; + while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs)); + if (!isCSRestore(MBBI, CSRegs)) + ++MBBI; + } + + // Move SP to start of FP callee save spill area. + NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + + AFI->getGPRCalleeSavedArea2Size() + + AFI->getDPRCalleeSavedAreaSize()); + + if (hasFP(MF)) { + NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot or target is ELF and the function has FP. + if (NumBytes) + emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes, + TII, *this, dl); + else + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) + .addReg(FramePtr); + } else { + if (MBBI->getOpcode() == ARM::tBX_RET && + &MBB.front() != MBBI && + prior(MBBI)->getOpcode() == ARM::tPOP) { + MachineBasicBlock::iterator PMBBI = prior(MBBI); + emitSPUpdate(MBB, PMBBI, TII, dl, *this, NumBytes); + } else + emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes); + } + } + + if (VARegSaveSize) { + // Unlike T2 and ARM mode, the T1 pop instruction cannot restore + // to LR, and we can't pop the value directly to the PC since + // we need to update the SP after popping the value. Therefore, we + // pop the old LR into R3 as a temporary. + + // Move back past the callee-saved register restoration + while (MBBI != MBB.end() && isCSRestore(MBBI, CSRegs)) + ++MBBI; + // Epilogue for vararg functions: pop LR to R3 and branch off it. + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) + .addReg(ARM::R3, RegState::Define); + + emitSPUpdate(MBB, MBBI, TII, dl, *this, VARegSaveSize); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)) + .addReg(ARM::R3, RegState::Kill); + // erase the old tBX_RET instruction + MBB.erase(MBBI); + } +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h new file mode 100644 index 0000000..4eca367 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h @@ -0,0 +1,70 @@ +//===- Thumb1RegisterInfo.h - Thumb-1 Register Information Impl -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-1 implementation of the TargetRegisterInfo +// class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef THUMB1REGISTERINFO_H +#define THUMB1REGISTERINFO_H + +#include "ARM.h" +#include "ARMRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + class ARMBaseInstrInfo; + class Type; + +struct Thumb1RegisterInfo : public ARMBaseRegisterInfo { +public: + Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); + + /// emitLoadConstPool - Emits a load from constpool to materialize the + /// specified immediate. + void emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) const; + + /// Code Generation virtual methods... + const TargetRegisterClass * + getPhysicalRegisterRegClass(unsigned Reg, EVT VT = MVT::Other) const; + + bool hasReservedCallFrame(MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + // rewrite MI to access 'Offset' bytes from the FP. Return the offset that + // could not be handled directly in MI. + int rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int Offset, + unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const; + + bool saveScavengerRegister(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator &UseMI, + const TargetRegisterClass *RC, + unsigned Reg) const; + unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, FrameIndexValue *Value = NULL, + RegScavenger *RS = NULL) const; + + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; +}; +} + +#endif // THUMB1REGISTERINFO_H diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp new file mode 100644 index 0000000..f36d4ef --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -0,0 +1,121 @@ +//===-- Thumb2ITBlockPass.cpp - Insert Thumb IT blocks ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "thumb2-it" +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumITs, "Number of IT blocks inserted"); + +namespace { + struct Thumb2ITBlockPass : public MachineFunctionPass { + static char ID; + Thumb2ITBlockPass() : MachineFunctionPass(&ID) {} + + const Thumb2InstrInfo *TII; + ARMFunctionInfo *AFI; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "Thumb IT blocks insertion pass"; + } + + private: + bool InsertITBlocks(MachineBasicBlock &MBB); + }; + char Thumb2ITBlockPass::ID = 0; +} + +static ARMCC::CondCodes getPredicate(const MachineInstr *MI, unsigned &PredReg){ + unsigned Opc = MI->getOpcode(); + if (Opc == ARM::tBcc || Opc == ARM::t2Bcc) + return ARMCC::AL; + return llvm::getInstrPredicate(MI, PredReg); +} + +bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineInstr *MI = &*MBBI; + DebugLoc dl = MI->getDebugLoc(); + unsigned PredReg = 0; + ARMCC::CondCodes CC = getPredicate(MI, PredReg); + + if (CC == ARMCC::AL) { + ++MBBI; + continue; + } + + // Insert an IT instruction. + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT)) + .addImm(CC); + ++MBBI; + + // Finalize IT mask. + ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); + unsigned Mask = 0, Pos = 3; + // Branches, including tricky ones like LDM_RET, need to end an IT + // block so check the instruction we just put in the block. + while (MBBI != E && Pos && + (!MI->getDesc().isBranch() && !MI->getDesc().isReturn())) { + MachineInstr *NMI = &*MBBI; + MI = NMI; + DebugLoc ndl = NMI->getDebugLoc(); + unsigned NPredReg = 0; + ARMCC::CondCodes NCC = getPredicate(NMI, NPredReg); + if (NCC == CC || NCC == OCC) + Mask |= (NCC & 1) << Pos; + else + break; + --Pos; + ++MBBI; + } + Mask |= (1 << Pos); + // Tag along (firstcond[0] << 4) with the mask. + Mask |= (CC & 1) << 4; + MIB.addImm(Mask); + Modified = true; + ++NumITs; + } + + return Modified; +} + +bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { + const TargetMachine &TM = Fn.getTarget(); + AFI = Fn.getInfo<ARMFunctionInfo>(); + TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo()); + + if (!AFI->isThumbFunction()) + return false; + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= InsertITBlocks(MBB); + } + + return Modified; +} + +/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks +/// insertion pass. +FunctionPass *llvm::createThumb2ITBlockPass() { + return new Thumb2ITBlockPass(); +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp new file mode 100644 index 0000000..531d5e9 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -0,0 +1,504 @@ +//===- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-2 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "Thumb2InstrInfo.h" +#include "ARM.h" +#include "ARMConstantPoolValue.h" +#include "ARMAddressingModes.h" +#include "ARMGenInstrInfo.inc" +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/ADT/SmallVector.h" +#include "Thumb2InstrInfo.h" + +using namespace llvm; + +Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI), RI(*this, STI) { +} + +unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const { + // FIXME + return 0; +} + +bool +Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC, + DebugLoc DL) const { + if (DestRC == ARM::GPRRegisterClass) { + if (SrcRC == ARM::GPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg); + return true; + } else if (SrcRC == ARM::tGPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg); + return true; + } + } else if (DestRC == ARM::tGPRRegisterClass) { + if (SrcRC == ARM::GPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg); + return true; + } else if (SrcRC == ARM::tGPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg); + return true; + } + } + + // Handle SPR, DPR, and QPR copies. 
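+  // For example (illustration only): a copy between two SPR, DPR or QPR
+  // registers is not matched above and is delegated to the base class, which
+  // emits the appropriate VFP/NEON move for that register class.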
+ return ARMBaseInstrInfo::copyRegToReg(MBB, I, DestReg, SrcReg, DestRC, SrcRC, DL); +} + +void Thumb2InstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), + MachineMemOperand::MOStore, 0, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + return; + } + + ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI); +} + +void Thumb2InstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), + MachineMemOperand::MOLoad, 0, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + return; + } + + ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI); +} + +void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII) { + bool isSub = NumBytes < 0; + if (isSub) NumBytes = -NumBytes; + + // If profitable, use a movw or movt to materialize the offset. + // FIXME: Use the scavenger to grab a scratch register. + if (DestReg != ARM::SP && DestReg != BaseReg && + NumBytes >= 4096 && + ARM_AM::getT2SOImmVal(NumBytes) == -1) { + bool Fits = false; + if (NumBytes < 65536) { + // Use a movw to materialize the 16-bit constant. + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg) + .addImm(NumBytes) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + Fits = true; + } else if ((NumBytes & 0xffff) == 0) { + // Use a movt to materialize the 32-bit constant. + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg) + .addReg(DestReg) + .addImm(NumBytes >> 16) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + Fits = true; + } + + if (Fits) { + if (isSub) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg) + .addReg(BaseReg, RegState::Kill) + .addReg(DestReg, RegState::Kill) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + } else { + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg) + .addReg(DestReg, RegState::Kill) + .addReg(BaseReg, RegState::Kill) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + } + return; + } + } + + while (NumBytes) { + unsigned ThisVal = NumBytes; + unsigned Opc = 0; + if (DestReg == ARM::SP && BaseReg != ARM::SP) { + // mov sp, rn. Note t2MOVr cannot be used. 
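+      // Illustrative expansion (not part of the original code): materializing
+      // sp = r7 + 16 copies r7 into sp here and lets the next loop iteration
+      // emit the SP-relative add:
+      //   mov sp, r7
+      //   add sp, #16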
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr),DestReg).addReg(BaseReg); + BaseReg = ARM::SP; + continue; + } + + bool HasCCOut = true; + if (BaseReg == ARM::SP) { + // sub sp, sp, #imm7 + if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) { + assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?"); + Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; + // FIXME: Fix Thumb1 immediate encoding. + BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg).addImm(ThisVal/4); + NumBytes = 0; + continue; + } + + // sub rd, sp, so_imm + Opc = isSub ? ARM::t2SUBrSPi : ARM::t2ADDrSPi; + if (ARM_AM::getT2SOImmVal(NumBytes) != -1) { + NumBytes = 0; + } else { + // FIXME: Move this to ARMAddressingModes.h? + unsigned RotAmt = CountLeadingZeros_32(ThisVal); + ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt); + NumBytes &= ~ThisVal; + assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 && + "Bit extraction didn't work?"); + } + } else { + assert(DestReg != ARM::SP && BaseReg != ARM::SP); + Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri; + if (ARM_AM::getT2SOImmVal(NumBytes) != -1) { + NumBytes = 0; + } else if (ThisVal < 4096) { + Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12; + HasCCOut = false; + NumBytes = 0; + } else { + // FIXME: Move this to ARMAddressingModes.h? + unsigned RotAmt = CountLeadingZeros_32(ThisVal); + ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt); + NumBytes &= ~ThisVal; + assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 && + "Bit extraction didn't work?"); + } + } + + // Build the new ADD / SUB. + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg, RegState::Kill) + .addImm(ThisVal)); + if (HasCCOut) + AddDefaultCC(MIB); + + BaseReg = DestReg; + } +} + +static unsigned +negativeOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRi12: return ARM::t2LDRi8; + case ARM::t2LDRHi12: return ARM::t2LDRHi8; + case ARM::t2LDRBi12: return ARM::t2LDRBi8; + case ARM::t2LDRSHi12: return ARM::t2LDRSHi8; + case ARM::t2LDRSBi12: return ARM::t2LDRSBi8; + case ARM::t2STRi12: return ARM::t2STRi8; + case ARM::t2STRBi12: return ARM::t2STRBi8; + case ARM::t2STRHi12: return ARM::t2STRHi8; + + case ARM::t2LDRi8: + case ARM::t2LDRHi8: + case ARM::t2LDRBi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRSBi8: + case ARM::t2STRi8: + case ARM::t2STRBi8: + case ARM::t2STRHi8: + return opcode; + + default: + break; + } + + return 0; +} + +static unsigned +positiveOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRi8: return ARM::t2LDRi12; + case ARM::t2LDRHi8: return ARM::t2LDRHi12; + case ARM::t2LDRBi8: return ARM::t2LDRBi12; + case ARM::t2LDRSHi8: return ARM::t2LDRSHi12; + case ARM::t2LDRSBi8: return ARM::t2LDRSBi12; + case ARM::t2STRi8: return ARM::t2STRi12; + case ARM::t2STRBi8: return ARM::t2STRBi12; + case ARM::t2STRHi8: return ARM::t2STRHi12; + + case ARM::t2LDRi12: + case ARM::t2LDRHi12: + case ARM::t2LDRBi12: + case ARM::t2LDRSHi12: + case ARM::t2LDRSBi12: + case ARM::t2STRi12: + case ARM::t2STRBi12: + case ARM::t2STRHi12: + return opcode; + + default: + break; + } + + return 0; +} + +static unsigned +immediateOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRs: return ARM::t2LDRi12; + case ARM::t2LDRHs: return ARM::t2LDRHi12; + case ARM::t2LDRBs: return ARM::t2LDRBi12; + case ARM::t2LDRSHs: return ARM::t2LDRSHi12; + case ARM::t2LDRSBs: return ARM::t2LDRSBi12; + case ARM::t2STRs: return ARM::t2STRi12; + case ARM::t2STRBs: return ARM::t2STRBi12; + case ARM::t2STRHs: return 
ARM::t2STRHi12; + + case ARM::t2LDRi12: + case ARM::t2LDRHi12: + case ARM::t2LDRBi12: + case ARM::t2LDRSHi12: + case ARM::t2LDRSBi12: + case ARM::t2STRi12: + case ARM::t2STRBi12: + case ARM::t2STRHi12: + case ARM::t2LDRi8: + case ARM::t2LDRHi8: + case ARM::t2LDRBi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRSBi8: + case ARM::t2STRi8: + case ARM::t2STRBi8: + case ARM::t2STRHi8: + return opcode; + + default: + break; + } + + return 0; +} + +bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) { + unsigned Opcode = MI.getOpcode(); + const TargetInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + bool isSub = false; + + // Memory operands in inline assembly always use AddrModeT2_i12. + if (Opcode == ARM::INLINEASM) + AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2? + + if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) { + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + + unsigned PredReg; + if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) { + // Turn it into a move. + MI.setDesc(TII.get(ARM::tMOVgpr2gpr)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + // Remove offset and remaining explicit predicate operands. + do MI.RemoveOperand(FrameRegIdx+1); + while (MI.getNumOperands() > FrameRegIdx+1 && + (!MI.getOperand(FrameRegIdx+1).isReg() || + !MI.getOperand(FrameRegIdx+1).isImm())); + return true; + } + + bool isSP = FrameReg == ARM::SP; + bool HasCCOut = Opcode != ARM::t2ADDri12; + + if (Offset < 0) { + Offset = -Offset; + isSub = true; + MI.setDesc(TII.get(isSP ? ARM::t2SUBrSPi : ARM::t2SUBri)); + } else { + MI.setDesc(TII.get(isSP ? ARM::t2ADDrSPi : ARM::t2ADDri)); + } + + // Common case: small offset, fits into instruction. + if (ARM_AM::getT2SOImmVal(Offset) != -1) { + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + // Add cc_out operand if the original instruction did not have one. + if (!HasCCOut) + MI.addOperand(MachineOperand::CreateReg(0, false)); + Offset = 0; + return true; + } + // Another common case: imm12. + if (Offset < 4096 && + (!HasCCOut || MI.getOperand(MI.getNumOperands()-1).getReg() == 0)) { + unsigned NewOpc = isSP + ? (isSub ? ARM::t2SUBrSPi12 : ARM::t2ADDrSPi12) + : (isSub ? ARM::t2SUBri12 : ARM::t2ADDri12); + MI.setDesc(TII.get(NewOpc)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + // Remove the cc_out operand. + if (HasCCOut) + MI.RemoveOperand(MI.getNumOperands()-1); + Offset = 0; + return true; + } + + // Otherwise, extract 8 adjacent bits from the immediate into this + // t2ADDri/t2SUBri. + unsigned RotAmt = CountLeadingZeros_32(Offset); + unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt); + + // We will handle these bits from offset, clear them. + Offset &= ~ThisImmVal; + + assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 && + "Bit extraction didn't work?"); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal); + // Add cc_out operand if the original instruction did not have one. + if (!HasCCOut) + MI.addOperand(MachineOperand::CreateReg(0, false)); + + } else { + + // AddrMode4 and AddrMode6 cannot handle any offset. + if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6) + return false; + + // AddrModeT2_so cannot handle any offset. 
If there is no offset + // register then we change to an immediate version. + unsigned NewOpc = Opcode; + if (AddrMode == ARMII::AddrModeT2_so) { + unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg(); + if (OffsetReg != 0) { + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + return Offset == 0; + } + + MI.RemoveOperand(FrameRegIdx+1); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0); + NewOpc = immediateOffsetOpcode(Opcode); + AddrMode = ARMII::AddrModeT2_i12; + } + + unsigned NumBits = 0; + unsigned Scale = 1; + if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) { + // i8 supports only negative, and i12 supports only positive, so + // based on Offset sign convert Opcode to the appropriate + // instruction + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + if (Offset < 0) { + NewOpc = negativeOffsetOpcode(Opcode); + NumBits = 8; + isSub = true; + Offset = -Offset; + } else { + NewOpc = positiveOffsetOpcode(Opcode); + NumBits = 12; + } + } else if (AddrMode == ARMII::AddrMode5) { + // VFP address mode. + const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1); + int InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm()); + if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 8; + Scale = 4; + Offset += InstrOffs * 4; + assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!"); + if (Offset < 0) { + Offset = -Offset; + isSub = true; + } + } else { + llvm_unreachable("Unsupported addressing mode!"); + } + + if (NewOpc != Opcode) + MI.setDesc(TII.get(NewOpc)); + + MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1); + + // Attempt to fold address computation + // Common case: small offset, fits into instruction. + int ImmedOffset = Offset / Scale; + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) { + // Replace the FrameIndex with fp/sp + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + if (isSub) { + if (AddrMode == ARMII::AddrMode5) + // FIXME: Not consistent. + ImmedOffset |= 1 << NumBits; + else + ImmedOffset = -ImmedOffset; + } + ImmOp.ChangeToImmediate(ImmedOffset); + Offset = 0; + return true; + } + + // Otherwise, offset doesn't fit. Pull in what we can to simplify + ImmedOffset = ImmedOffset & Mask; + if (isSub) { + if (AddrMode == ARMII::AddrMode5) + // FIXME: Not consistent. + ImmedOffset |= 1 << NumBits; + else { + ImmedOffset = -ImmedOffset; + if (ImmedOffset == 0) + // Change the opcode back if the encoded offset is zero. + MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc))); + } + } + ImmOp.ChangeToImmediate(ImmedOffset); + Offset &= ~(Mask*Scale); + } + + Offset = (isSub) ? -Offset : Offset; + return Offset == 0; +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h new file mode 100644 index 0000000..2948770 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -0,0 +1,61 @@ +//===- Thumb2InstrInfo.h - Thumb-2 Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-2 implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef THUMB2INSTRUCTIONINFO_H +#define THUMB2INSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "ARM.h" +#include "ARMInstrInfo.h" +#include "Thumb2RegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + +class Thumb2InstrInfo : public ARMBaseInstrInfo { + Thumb2RegisterInfo RI; +public: + explicit Thumb2InstrInfo(const ARMSubtarget &STI); + + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is no such opcode. + unsigned getUnindexedOpcode(unsigned Opc) const; + + bool copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC, + DebugLoc DL) const; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegisterInfo. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const Thumb2RegisterInfo &getRegisterInfo() const { return RI; } +}; +} + +#endif // THUMB2INSTRUCTIONINFO_H diff --git a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp new file mode 100644 index 0000000..07dd0be --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp @@ -0,0 +1,62 @@ +//===- Thumb2RegisterInfo.cpp - Thumb-2 Register Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-2 implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMBaseInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "Thumb2InstrInfo.h" +#include "Thumb2RegisterInfo.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/LLVMContext.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/ErrorHandling.h" +using namespace llvm; + +Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &sti) + : ARMBaseRegisterInfo(tii, sti) { +} + +/// emitLoadConstPool - Emits a load from constpool to materialize the +/// specified immediate.
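+/// The value is placed in the function's constant pool and loaded into DestReg +/// with a t2LDRpci that is always executed (AL predicate).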
+void Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred, + unsigned PredReg) const { + MachineFunction &MF = *MBB.getParent(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + const Constant *C = ConstantInt::get( + Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0); +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h new file mode 100644 index 0000000..b3cf2e5 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h @@ -0,0 +1,42 @@ +//===- Thumb2RegisterInfo.h - Thumb-2 Register Information Impl -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-2 implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#ifndef THUMB2REGISTERINFO_H +#define THUMB2REGISTERINFO_H + +#include "ARM.h" +#include "ARMRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + class ARMBaseInstrInfo; + class Type; + +struct Thumb2RegisterInfo : public ARMBaseRegisterInfo { +public: + Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); + + /// emitLoadConstPool - Emits a load from constpool to materialize the + /// specified immediate. + void emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) const; +}; +} + +#endif // THUMB2REGISTERINFO_H diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp new file mode 100644 index 0000000..8fe2e42 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -0,0 +1,721 @@ +//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "t2-reduce-size" +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMBaseInstrInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones"); +STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones"); +STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones"); + +static cl::opt<int> ReduceLimit("t2-reduce-limit", + cl::init(-1), cl::Hidden); +static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2", + cl::init(-1), cl::Hidden); +static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3", + cl::init(-1), cl::Hidden); + +namespace { + /// ReduceTable - A static table with information on mapping from wide + /// opcodes to narrow + struct ReduceEntry { + unsigned WideOpc; // Wide opcode + unsigned NarrowOpc1; // Narrow opcode to transform to + unsigned NarrowOpc2; // Narrow opcode when it's two-address + uint8_t Imm1Limit; // Limit of immediate field (bits) + uint8_t Imm2Limit; // Limit of immediate field when it's two-address + unsigned LowRegs1 : 1; // Only possible if low-registers are used + unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr) + unsigned PredCC1 : 2; // 0 - If predicated, cc is on and vice versa. + // 1 - No cc field. + // 2 - Always set CPSR. + unsigned PredCC2 : 2; + unsigned Special : 1; // Needs to be dealt with specially + }; + + static const ReduceEntry ReduceTable[] = { + // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, S + { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0 }, + { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0 }, + // Note: immediate scale is 4. + { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 0 }, + { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 1 }, + { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 1 }, + { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 0 }, + { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 0 }, + //FIXME: Disable CMN, as CCodes are backwards from compare expectations + //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0 }, + { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0 }, + { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0 }, + { ARM::t2CMPzri,ARM::tCMPzi8, 0, 8, 0, 1, 0, 2,0, 0 }, + { ARM::t2CMPzrr,ARM::tCMPzhir,0, 0, 0, 0, 0, 2,0, 0 }, + { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 0 }, + // FIXME: adr.n immediate offset must be multiple of 4. + //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 0 }, + { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 0 }, + { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 }, + { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1 }, + // FIXME: Do we need the 16-bit 'S' variant? 
+ { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0 }, + { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0 }, + { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 1, 0,1, 0 }, + { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0 }, + { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 1 }, + { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0 }, + { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0 }, + { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0 }, + { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0 }, + { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0 }, + { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0 }, + + // FIXME: Clean this up after splitting each Thumb load / store opcode + // into multiple ones. + { ARM::t2LDRi12,ARM::tLDR, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 1 }, + { ARM::t2LDRs, ARM::tLDR, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRBi12,ARM::tLDRB, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRBs, ARM::tLDRB, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRHi12,ARM::tLDRH, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRHs, ARM::tLDRH, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRi12,ARM::tSTR, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 1 }, + { ARM::t2STRs, ARM::tSTR, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRBi12,ARM::tSTRB, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRBs, ARM::tSTRB, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRHi12,ARM::tSTRH, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRHs, ARM::tSTRH, 0, 0, 0, 1, 0, 0,0, 1 }, + + { ARM::t2LDM, ARM::tLDM, 0, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2LDM_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2LDM_UPD,ARM::tLDM_UPD,ARM::tPOP, 0, 0, 1, 1, 1,1, 1 }, + // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent + { ARM::t2STM_UPD,ARM::tSTM_UPD,ARM::tPUSH, 0, 0, 1, 1, 1,1, 1 }, + }; + + class Thumb2SizeReduce : public MachineFunctionPass { + public: + static char ID; + Thumb2SizeReduce(); + + const Thumb2InstrInfo *TII; + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "Thumb2 instruction size reduction pass"; + } + + private: + /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable. + DenseMap<unsigned, unsigned> ReduceOpcodeMap; + + bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, + bool is2Addr, ARMCC::CondCodes Pred, + bool LiveCPSR, bool &HasCC, bool &CCDead); + + bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry); + + bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, bool LiveCPSR); + + /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address + /// instruction. + bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR); + + /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit + /// non-two-address instruction. 
+ bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR); + + /// ReduceMBB - Reduce width of instructions in the specified basic block. + bool ReduceMBB(MachineBasicBlock &MBB); + }; + char Thumb2SizeReduce::ID = 0; +} + +Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(&ID) { + for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) { + unsigned FromOpc = ReduceTable[i].WideOpc; + if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second) + assert(false && "Duplicated entries?"); + } +} + +static bool HasImplicitCPSRDef(const TargetInstrDesc &TID) { + for (const unsigned *Regs = TID.ImplicitDefs; *Regs; ++Regs) + if (*Regs == ARM::CPSR) + return true; + return false; +} + +bool +Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, + bool is2Addr, ARMCC::CondCodes Pred, + bool LiveCPSR, bool &HasCC, bool &CCDead) { + if ((is2Addr && Entry.PredCC2 == 0) || + (!is2Addr && Entry.PredCC1 == 0)) { + if (Pred == ARMCC::AL) { + // Not predicated, must set CPSR. + if (!HasCC) { + // Original instruction was not setting CPSR, but CPSR is not + // currently live anyway. It's ok to set it. The CPSR def is + // dead though. + if (!LiveCPSR) { + HasCC = true; + CCDead = true; + return true; + } + return false; + } + } else { + // Predicated, must not set CPSR. + if (HasCC) + return false; + } + } else if ((is2Addr && Entry.PredCC2 == 2) || + (!is2Addr && Entry.PredCC1 == 2)) { + // Old opcode has an optional def of CPSR. + if (HasCC) + return true; + // If the old opcode does not have an implicit CPSR def, then it's not ok, + // since the new opcode's CPSR def is not meant to be thrown away, e.g. CMP. + if (!HasImplicitCPSRDef(MI->getDesc())) + return false; + HasCC = true; + } else { + // 16-bit instruction does not set CPSR. + if (HasCC) + return false; + } + + return true; +} + +static bool VerifyLowRegs(MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + bool isPCOk = (Opc == ARM::t2LDM_RET || Opc == ARM::t2LDM || + Opc == ARM::t2LDM_UPD); + bool isLROk = (Opc == ARM::t2STM_UPD); + bool isSPOk = isPCOk || isLROk || (Opc == ARM::t2ADDrSPi); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || MO.isImplicit()) + continue; + unsigned Reg = MO.getReg(); + if (Reg == 0 || Reg == ARM::CPSR) + continue; + if (isPCOk && Reg == ARM::PC) + continue; + if (isLROk && Reg == ARM::LR) + continue; + if (Reg == ARM::SP) { + if (isSPOk) + continue; + if (i == 1 && (Opc == ARM::t2LDRi12 || Opc == ARM::t2STRi12)) + // Special case for these ldr / str with sp as base register. + continue; + } + if (!isARMLowRegister(Reg)) + return false; + } + return true; +} + +bool +Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry) { + if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt)) + return false; + + unsigned Scale = 1; + bool HasImmOffset = false; + bool HasShift = false; + bool HasOffReg = true; + bool isLdStMul = false; + unsigned Opc = Entry.NarrowOpc1; + unsigned OpNum = 3; // First 'rest' of operands.
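+ // Classify the wide load / store opcode: pick the narrow opcode, the + // immediate limit and scale, and whether the reduced form takes an immediate + // offset, a register offset, or is a load / store multiple.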
+ uint8_t ImmLimit = Entry.Imm1Limit; + switch (Entry.WideOpc) { + default: + llvm_unreachable("Unexpected Thumb2 load / store opcode!"); + case ARM::t2LDRi12: + case ARM::t2STRi12: { + unsigned BaseReg = MI->getOperand(1).getReg(); + if (BaseReg == ARM::SP) { + Opc = Entry.NarrowOpc2; + ImmLimit = Entry.Imm2Limit; + HasOffReg = false; + } + Scale = 4; + HasImmOffset = true; + break; + } + case ARM::t2LDRBi12: + case ARM::t2STRBi12: + HasImmOffset = true; + break; + case ARM::t2LDRHi12: + case ARM::t2STRHi12: + Scale = 2; + HasImmOffset = true; + break; + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSBs: + case ARM::t2LDRSHs: + case ARM::t2STRs: + case ARM::t2STRBs: + case ARM::t2STRHs: + HasShift = true; + OpNum = 4; + break; + case ARM::t2LDM: { + unsigned BaseReg = MI->getOperand(0).getReg(); + ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm()); + if (!isARMLowRegister(BaseReg) || Mode != ARM_AM::ia) + return false; + OpNum = 0; + isLdStMul = true; + break; + } + case ARM::t2LDM_RET: { + unsigned BaseReg = MI->getOperand(1).getReg(); + if (BaseReg != ARM::SP) + return false; + Opc = Entry.NarrowOpc2; // tPOP_RET + OpNum = 3; + isLdStMul = true; + break; + } + case ARM::t2LDM_UPD: + case ARM::t2STM_UPD: { + OpNum = 0; + unsigned BaseReg = MI->getOperand(1).getReg(); + ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(2).getImm()); + if (BaseReg == ARM::SP && + ((Entry.WideOpc == ARM::t2LDM_UPD && Mode == ARM_AM::ia) || + (Entry.WideOpc == ARM::t2STM_UPD && Mode == ARM_AM::db))) { + Opc = Entry.NarrowOpc2; // tPOP or tPUSH + OpNum = 3; + } else if (!isARMLowRegister(BaseReg) || Mode != ARM_AM::ia) { + return false; + } + isLdStMul = true; + break; + } + } + + unsigned OffsetReg = 0; + bool OffsetKill = false; + if (HasShift) { + OffsetReg = MI->getOperand(2).getReg(); + OffsetKill = MI->getOperand(2).isKill(); + if (MI->getOperand(3).getImm()) + // Thumb1 addressing mode doesn't support shift. + return false; + } + + unsigned OffsetImm = 0; + if (HasImmOffset) { + OffsetImm = MI->getOperand(2).getImm(); + unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale; + if ((OffsetImm & (Scale-1)) || OffsetImm > MaxOffset) + // Make sure the immediate field fits. + return false; + } + + // Add the 16-bit load / store instruction. + // FIXME: Thumb1 addressing mode encode both immediate and register offset. + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, TII->get(Opc)); + if (!isLdStMul) { + MIB.addOperand(MI->getOperand(0)).addOperand(MI->getOperand(1)); + if (Opc != ARM::tLDRSB && Opc != ARM::tLDRSH) { + // tLDRSB and tLDRSH do not have an immediate offset field. On the other + // hand, it must have an offset register. + // FIXME: Remove this special case. + MIB.addImm(OffsetImm/Scale); + } + assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!"); + + if (HasOffReg) + MIB.addReg(OffsetReg, getKillRegState(OffsetKill)); + } + + // Transfer the rest of operands. + for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum) + MIB.addOperand(MI->getOperand(OpNum)); + + // Transfer memoperands. 
+ (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); + + MBB.erase(MI); + ++NumLdSts; + return true; +} + +bool +Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR) { + if (Entry.LowRegs1 && !VerifyLowRegs(MI)) + return false; + + const TargetInstrDesc &TID = MI->getDesc(); + if (TID.mayLoad() || TID.mayStore()) + return ReduceLoadStore(MBB, MI, Entry); + + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: break; + case ARM::t2ADDSri: + case ARM::t2ADDSrr: { + unsigned PredReg = 0; + if (getInstrPredicate(MI, PredReg) == ARMCC::AL) { + switch (Opc) { + default: break; + case ARM::t2ADDSri: { + if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) + return true; + // fallthrough + } + case ARM::t2ADDSrr: + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + } + } + break; + } + case ARM::t2RSBri: + case ARM::t2RSBSri: + if (MI->getOperand(2).getImm() == 0) + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + break; + case ARM::t2MOVi16: + // Can convert only 'pure' immediate operands, not immediates obtained as + // globals' addresses. + if (MI->getOperand(1).isImm()) + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + break; + } + return false; +} + +bool +Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR) { + + if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) + return false; + + const TargetInstrDesc &TID = MI->getDesc(); + unsigned Reg0 = MI->getOperand(0).getReg(); + unsigned Reg1 = MI->getOperand(1).getReg(); + if (Reg0 != Reg1) + return false; + if (Entry.LowRegs2 && !isARMLowRegister(Reg0)) + return false; + if (Entry.Imm2Limit) { + unsigned Imm = MI->getOperand(2).getImm(); + unsigned Limit = (1 << Entry.Imm2Limit) - 1; + if (Imm > Limit) + return false; + } else { + unsigned Reg2 = MI->getOperand(2).getReg(); + if (Entry.LowRegs2 && !isARMLowRegister(Reg2)) + return false; + } + + // Check if it's possible / necessary to transfer the predicate. + const TargetInstrDesc &NewTID = TII->get(Entry.NarrowOpc2); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); + bool SkipPred = false; + if (Pred != ARMCC::AL) { + if (!NewTID.isPredicable()) + // Can't transfer predicate, fail. + return false; + } else { + SkipPred = !NewTID.isPredicable(); + } + + bool HasCC = false; + bool CCDead = false; + if (TID.hasOptionalDef()) { + unsigned NumOps = TID.getNumOperands(); + HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR); + if (HasCC && MI->getOperand(NumOps-1).isDead()) + CCDead = true; + } + if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead)) + return false; + + // Add the 16-bit instruction. + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID); + MIB.addOperand(MI->getOperand(0)); + if (NewTID.hasOptionalDef()) { + if (HasCC) + AddDefaultT1CC(MIB, CCDead); + else + AddNoT1CC(MIB); + } + + // Transfer the rest of operands. 
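+ // The optional CPSR def was handled above; predicate operands are copied + // unless the narrow opcode is not predicable.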
+ unsigned NumOps = TID.getNumOperands(); + for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { + if (i < NumOps && TID.OpInfo[i].isOptionalDef()) + continue; + if (SkipPred && TID.OpInfo[i].isPredicate()) + continue; + MIB.addOperand(MI->getOperand(i)); + } + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); + + MBB.erase(MI); + ++Num2Addrs; + return true; +} + +bool +Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR) { + if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) + return false; + + unsigned Limit = ~0U; + unsigned Scale = (Entry.WideOpc == ARM::t2ADDrSPi) ? 4 : 1; + if (Entry.Imm1Limit) + Limit = ((1 << Entry.Imm1Limit) - 1) * Scale; + + const TargetInstrDesc &TID = MI->getDesc(); + for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) { + if (TID.OpInfo[i].isPredicate()) + continue; + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + if (!Reg || Reg == ARM::CPSR) + continue; + if (Entry.WideOpc == ARM::t2ADDrSPi && Reg == ARM::SP) + continue; + if (Entry.LowRegs1 && !isARMLowRegister(Reg)) + return false; + } else if (MO.isImm() && + !TID.OpInfo[i].isPredicate()) { + if (((unsigned)MO.getImm()) > Limit || (MO.getImm() & (Scale-1)) != 0) + return false; + } + } + + // Check if it's possible / necessary to transfer the predicate. + const TargetInstrDesc &NewTID = TII->get(Entry.NarrowOpc1); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); + bool SkipPred = false; + if (Pred != ARMCC::AL) { + if (!NewTID.isPredicable()) + // Can't transfer predicate, fail. + return false; + } else { + SkipPred = !NewTID.isPredicable(); + } + + bool HasCC = false; + bool CCDead = false; + if (TID.hasOptionalDef()) { + unsigned NumOps = TID.getNumOperands(); + HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR); + if (HasCC && MI->getOperand(NumOps-1).isDead()) + CCDead = true; + } + if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead)) + return false; + + // Add the 16-bit instruction. + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID); + MIB.addOperand(MI->getOperand(0)); + if (NewTID.hasOptionalDef()) { + if (HasCC) + AddDefaultT1CC(MIB, CCDead); + else + AddNoT1CC(MIB); + } + + // Transfer the rest of operands. + unsigned NumOps = TID.getNumOperands(); + for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { + if (i < NumOps && TID.OpInfo[i].isOptionalDef()) + continue; + if ((TID.getOpcode() == ARM::t2RSBSri || + TID.getOpcode() == ARM::t2RSBri) && i == 2) + // Skip the zero immediate operand, it's now implicit. + continue; + bool isPred = (i < NumOps && TID.OpInfo[i].isPredicate()); + if (SkipPred && isPred) + continue; + const MachineOperand &MO = MI->getOperand(i); + if (Scale > 1 && !isPred && MO.isImm()) + MIB.addImm(MO.getImm() / Scale); + else { + if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR) + // Skip implicit def of CPSR. Either it's modeled as an optional + // def now or it's already an implicit def on the new instruction. 
+ continue; + MIB.addOperand(MO); + } + } + if (!TID.isPredicable() && NewTID.isPredicable()) + AddDefaultPred(MIB); + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); + + MBB.erase(MI); + ++NumNarrows; + return true; +} + +static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) { + bool HasDef = false; + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || MO.isUndef() || MO.isUse()) + continue; + if (MO.getReg() != ARM::CPSR) + continue; + if (!MO.isDead()) + HasDef = true; + } + + return HasDef || LiveCPSR; +} + +static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || MO.isUndef() || MO.isDef()) + continue; + if (MO.getReg() != ARM::CPSR) + continue; + assert(LiveCPSR && "CPSR liveness tracking is wrong!"); + if (MO.isKill()) { + LiveCPSR = false; + break; + } + } + + return LiveCPSR; +} + +bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + // Yes, CPSR could be livein. + bool LiveCPSR = MBB.isLiveIn(ARM::CPSR); + + MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end(); + MachineBasicBlock::iterator NextMII; + for (; MII != E; MII = NextMII) { + NextMII = llvm::next(MII); + + MachineInstr *MI = &*MII; + LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR); + + unsigned Opcode = MI->getOpcode(); + DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode); + if (OPI != ReduceOpcodeMap.end()) { + const ReduceEntry &Entry = ReduceTable[OPI->second]; + // Ignore "special" cases for now. + if (Entry.Special) { + if (ReduceSpecial(MBB, MI, Entry, LiveCPSR)) { + Modified = true; + MachineBasicBlock::iterator I = prior(NextMII); + MI = &*I; + } + goto ProcessNext; + } + + // Try to transform to a 16-bit two-address instruction. + if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) { + Modified = true; + MachineBasicBlock::iterator I = prior(NextMII); + MI = &*I; + goto ProcessNext; + } + + // Try to transform to a 16-bit non-two-address instruction. + if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) { + Modified = true; + MachineBasicBlock::iterator I = prior(NextMII); + MI = &*I; + } + } + + ProcessNext: + LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR); + } + + return Modified; +} + +bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { + const TargetMachine &TM = MF.getTarget(); + TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo()); + + bool Modified = false; + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) + Modified |= ReduceMBB(*I); + return Modified; +} + +/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size +/// reduction pass. +FunctionPass *llvm::createThumb2SizeReductionPass() { + return new Thumb2SizeReduce(); +}