diff options
Diffstat (limited to 'contrib/llvm/lib/Target/ARM')
87 files changed, 33393 insertions, 19121 deletions
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h index 271ca44..4679f74 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.h +++ b/contrib/llvm/lib/Target/ARM/ARM.h @@ -15,6 +15,7 @@ #ifndef TARGET_ARM_H #define TARGET_ARM_H +#include "ARMBaseInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetMachine.h" #include <cassert> @@ -25,97 +26,17 @@ class ARMBaseTargetMachine; class FunctionPass; class JITCodeEmitter; class formatted_raw_ostream; +class MCCodeEmitter; +class TargetAsmBackend; +class MachineInstr; +class ARMAsmPrinter; +class MCInst; -// Enums corresponding to ARM condition codes -namespace ARMCC { - // The CondCodes constants map directly to the 4-bit encoding of the - // condition field for predicated instructions. - enum CondCodes { // Meaning (integer) Meaning (floating-point) - EQ, // Equal Equal - NE, // Not equal Not equal, or unordered - HS, // Carry set >, ==, or unordered - LO, // Carry clear Less than - MI, // Minus, negative Less than - PL, // Plus, positive or zero >, ==, or unordered - VS, // Overflow Unordered - VC, // No overflow Not unordered - HI, // Unsigned higher Greater than, or unordered - LS, // Unsigned lower or same Less than or equal - GE, // Greater than or equal Greater than or equal - LT, // Less than Less than, or unordered - GT, // Greater than Greater than - LE, // Less than or equal <, ==, or unordered - AL // Always (unconditional) Always (unconditional) - }; +MCCodeEmitter *createARMMCCodeEmitter(const Target &, + TargetMachine &TM, + MCContext &Ctx); - inline static CondCodes getOppositeCondition(CondCodes CC) { - switch (CC) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return NE; - case NE: return EQ; - case HS: return LO; - case LO: return HS; - case MI: return PL; - case PL: return MI; - case VS: return VC; - case VC: return VS; - case HI: return LS; - case LS: return HI; - case GE: return LT; - case LT: return GE; - case GT: return LE; - case LE: return GT; - } - } -} // namespace ARMCC - -inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { - switch (CC) { - default: llvm_unreachable("Unknown condition code"); - case ARMCC::EQ: return "eq"; - case ARMCC::NE: return "ne"; - case ARMCC::HS: return "hs"; - case ARMCC::LO: return "lo"; - case ARMCC::MI: return "mi"; - case ARMCC::PL: return "pl"; - case ARMCC::VS: return "vs"; - case ARMCC::VC: return "vc"; - case ARMCC::HI: return "hi"; - case ARMCC::LS: return "ls"; - case ARMCC::GE: return "ge"; - case ARMCC::LT: return "lt"; - case ARMCC::GT: return "gt"; - case ARMCC::LE: return "le"; - case ARMCC::AL: return "al"; - } -} - -namespace ARM_MB { - // The Memory Barrier Option constants map directly to the 4-bit encoding of - // the option field for memory barrier operations. - enum MemBOpt { - ST = 14, - ISH = 11, - ISHST = 10, - NSH = 7, - NSHST = 6, - OSH = 3, - OSHST = 2 - }; - - inline static const char *MemBOptToString(unsigned val) { - switch (val) { - default: llvm_unreachable("Unknown memory opetion"); - case ST: return "st"; - case ISH: return "ish"; - case ISHST: return "ishst"; - case NSH: return "nsh"; - case NSHST: return "nshst"; - case OSH: return "osh"; - case OSHST: return "oshst"; - } - } -} // namespace ARM_MB +TargetAsmBackend *createARMAsmBackend(const Target &, const std::string &); FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); @@ -127,23 +48,16 @@ FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); FunctionPass *createARMGlobalMergePass(const TargetLowering* tli); FunctionPass *createARMConstantIslandPass(); -FunctionPass *createNEONPreAllocPass(); FunctionPass *createNEONMoveFixPass(); +FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); FunctionPass *createThumb2SizeReductionPass(); extern Target TheARMTarget, TheThumbTarget; -} // end namespace llvm; - -// Defines symbolic names for ARM registers. This defines a mapping from -// register name to register number. -// -#include "ARMGenRegisterNames.inc" - -// Defines symbolic names for the ARM instructions. -// -#include "ARMGenInstrNames.inc" +void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + ARMAsmPrinter &AP); +} // end namespace llvm; #endif diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td index d6a8f19..bf4315f 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.td +++ b/contrib/llvm/lib/Target/ARM/ARM.td @@ -16,6 +16,7 @@ include "llvm/Target/Target.td" + //===----------------------------------------------------------------------===// // ARM Subtarget features. // @@ -32,6 +33,8 @@ def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", "Does not support ARM mode execution">; def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable half-precision floating point">; +def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", + "Restrict VFP3 to 16 double registers">; def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", "Enable divide instructions">; def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true", @@ -43,14 +46,11 @@ def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", "Floating point unit supports single precision only">; -// Some processors have multiply-accumulate instructions that don't -// play nicely with other VFP instructions, and it's generally better +// Some processors have FP multiply-accumulate instructions that don't +// play nicely with other VFP / NEON instructions, and it's generally better // to just not use them. -// FIXME: Currently, this is only flagged for Cortex-A8. It may be true for -// others as well. We should do more benchmarking and confirm one way or -// the other. -def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true", - "Disable VFP MAC instructions">; +def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", + "Disable VFP / NEON MAC instructions">; // Some processors benefit from using NEON instructions for scalar // single-precision FP operations. def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", @@ -61,6 +61,9 @@ def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", "Prefer 32-bit Thumb instrs">; +// Multiprocessing extension. +def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", + "Supports Multiprocessing extension">; // ARM architectures. def ArchV4T : SubtargetFeature<"v4t", "ARMArchVersion", "V4T", @@ -91,6 +94,18 @@ def ArchV7M : SubtargetFeature<"v7m", "ARMArchVersion", "V7M", include "ARMSchedule.td" +// ARM processor families. +def ProcOthers : SubtargetFeature<"others", "ARMProcFamily", "Others", + "One of the other ARM processor families">; +def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8", + "Cortex-A8 ARM processors", + [FeatureSlowFPBrcc, FeatureNEONForFP, + FeatureHasSlowFPVMLx, FeatureT2XtPk]>; +def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", + "Cortex-A9 ARM processors", + [FeatureHasSlowFPVMLx, FeatureT2XtPk, + FeatureFP16]>; + class ProcNoItin<string Name, list<SubtargetFeature> Features> : Processor<Name, GenericItineraries, Features>; @@ -135,25 +150,27 @@ def : ProcNoItin<"iwmmxt", [ArchV5TE]>; // V6 Processors. def : Processor<"arm1136j-s", ARMV6Itineraries, [ArchV6]>; def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2, - FeatureHasSlowVMLx]>; + FeatureHasSlowFPVMLx]>; def : Processor<"arm1176jz-s", ARMV6Itineraries, [ArchV6]>; -def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; +def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2, + FeatureHasSlowFPVMLx]>; def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>; -def : Processor<"mpcore", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; +def : Processor<"mpcore", ARMV6Itineraries, [ArchV6, FeatureVFP2, + FeatureHasSlowFPVMLx]>; // V6M Processors. def : Processor<"cortex-m0", ARMV6Itineraries, [ArchV6M]>; // V6T2 Processors. def : Processor<"arm1156t2-s", ARMV6Itineraries, [ArchV6T2]>; -def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ArchV6T2, FeatureVFP2]>; +def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ArchV6T2, FeatureVFP2, + FeatureHasSlowFPVMLx]>; // V7 Processors. def : Processor<"cortex-a8", CortexA8Itineraries, - [ArchV7A, FeatureHasSlowVMLx, - FeatureSlowFPBrcc, FeatureNEONForFP, FeatureT2XtPk]>; + [ArchV7A, ProcA8]>; def : Processor<"cortex-a9", CortexA9Itineraries, - [ArchV7A, FeatureT2XtPk]>; + [ArchV7A, ProcA9]>; // V7M Processors. def : ProcNoItin<"cortex-m3", [ArchV7M]>; @@ -175,6 +192,17 @@ include "ARMInstrInfo.td" def ARMInstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Assembly printer +//===----------------------------------------------------------------------===// +// ARM Uses the MC printer for asm output, so make sure the TableGen +// AsmWriter bits get associated with the correct class. +def ARMAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + bit isMCAsmWriter = 1; +} + //===----------------------------------------------------------------------===// // Declare the target which we are implementing //===----------------------------------------------------------------------===// @@ -182,4 +210,6 @@ def ARMInstrInfo : InstrInfo; def ARM : Target { // Pull in Instruction Info: let InstructionSet = ARMInstrInfo; + + let AssemblyWriters = [ARMAsmWriter]; } diff --git a/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h index db48100..19fbf05 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h +++ b/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h @@ -50,6 +50,16 @@ namespace ARM_AM { } } + static inline unsigned getShiftOpcEncoding(ShiftOpc Op) { + switch (Op) { + default: assert(0 && "Unknown shift opc!"); + case ARM_AM::asr: return 2; + case ARM_AM::lsl: return 0; + case ARM_AM::lsr: return 1; + case ARM_AM::ror: return 3; + } + } + static inline ShiftOpc getShiftOpcForNode(SDValue N) { switch (N.getOpcode()) { default: return ARM_AM::no_shift; @@ -566,6 +576,8 @@ namespace ARM_AM { return Val; } + AMSubMode getLoadStoreMultipleSubMode(int Opcode); + } // end namespace ARM_AM } // end namespace llvm diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmBackend.cpp new file mode 100644 index 0000000..ec23449 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMAsmBackend.cpp @@ -0,0 +1,512 @@ +//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMFixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Object/MachOFormat.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetAsmBackend.h" +#include "llvm/Target/TargetRegistry.h" +using namespace llvm; + +namespace { +class ARMMachObjectWriter : public MCMachObjectTargetWriter { +public: + ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType, + uint32_t CPUSubtype) + : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype, + /*UseAggressiveSymbolFolding=*/true) {} +}; + +class ARMELFObjectWriter : public MCELFObjectTargetWriter { +public: + ARMELFObjectWriter(Triple::OSType OSType) + : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSType, ELF::EM_ARM, + /*HasRelocationAddend*/ false) {} +}; + +class ARMAsmBackend : public TargetAsmBackend { + bool isThumbMode; // Currently emitting Thumb code. +public: + ARMAsmBackend(const Target &T) : TargetAsmBackend(), isThumbMode(false) {} + + unsigned getNumFixupKinds() const { return ARM::NumTargetFixupKinds; } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[ARM::NumTargetFixupKinds] = { +// This table *must* be in the order that the fixup_* kinds are defined in +// ARMFixupKinds.h. +// +// Name Offset (bits) Size (bits) Flags +{ "fixup_arm_ldst_pcrel_12", 1, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, +{ "fixup_arm_pcrel_10", 1, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, +{ "fixup_thumb_adr_pcrel_10",0, 8, MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, +{ "fixup_arm_adr_pcrel_12", 1, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, +{ "fixup_arm_condbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_uncondbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_blx", 7, 21, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_cp", 1, 8, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_bcc", 1, 8, MCFixupKindInfo::FKF_IsPCRel }, +// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19. +{ "fixup_arm_movt_hi16", 0, 20, 0 }, +{ "fixup_arm_movw_lo16", 0, 20, 0 }, +{ "fixup_t2_movt_hi16", 0, 20, 0 }, +{ "fixup_t2_movw_lo16", 0, 20, 0 }, +{ "fixup_arm_movt_hi16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_movw_lo16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_movt_hi16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_movw_lo16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel }, + }; + + if (Kind < FirstTargetFixupKind) + return TargetAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + bool MayNeedRelaxation(const MCInst &Inst) const; + + void RelaxInstruction(const MCInst &Inst, MCInst &Res) const; + + bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; + + void HandleAssemblerFlag(MCAssemblerFlag Flag) { + switch (Flag) { + default: break; + case MCAF_Code16: + setIsThumb(true); + break; + case MCAF_Code32: + setIsThumb(false); + break; + } + } + + unsigned getPointerSize() const { return 4; } + bool isThumb() const { return isThumbMode; } + void setIsThumb(bool it) { isThumbMode = it; } +}; +} // end anonymous namespace + +bool ARMAsmBackend::MayNeedRelaxation(const MCInst &Inst) const { + // FIXME: Thumb targets, different move constant targets.. + return false; +} + +void ARMAsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const { + assert(0 && "ARMAsmBackend::RelaxInstruction() unimplemented"); + return; +} + +bool ARMAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { + if (isThumb()) { + // FIXME: 0xbf00 is the ARMv7 value. For v6 and before, we'll need to + // use 0x46c0 (which is a 'mov r8, r8' insn). + uint64_t NumNops = Count / 2; + for (uint64_t i = 0; i != NumNops; ++i) + OW->Write16(0xbf00); + if (Count & 1) + OW->Write8(0); + return true; + } + // ARM mode + uint64_t NumNops = Count / 4; + for (uint64_t i = 0; i != NumNops; ++i) + OW->Write32(0xe1a00000); + switch (Count % 4) { + default: break; // No leftover bytes to write + case 1: OW->Write8(0); break; + case 2: OW->Write16(0); break; + case 3: OW->Write16(0); OW->Write8(0xa0); break; + } + + return true; +} + +static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + return Value; + case ARM::fixup_arm_movt_hi16: + case ARM::fixup_arm_movt_hi16_pcrel: + Value >>= 16; + // Fallthrough + case ARM::fixup_arm_movw_lo16: + case ARM::fixup_arm_movw_lo16_pcrel: { + unsigned Hi4 = (Value & 0xF000) >> 12; + unsigned Lo12 = Value & 0x0FFF; + // inst{19-16} = Hi4; + // inst{11-0} = Lo12; + Value = (Hi4 << 16) | (Lo12); + return Value; + } + case ARM::fixup_t2_movt_hi16: + case ARM::fixup_t2_movt_hi16_pcrel: + Value >>= 16; + // Fallthrough + case ARM::fixup_t2_movw_lo16: + case ARM::fixup_t2_movw_lo16_pcrel: { + unsigned Hi4 = (Value & 0xF000) >> 12; + unsigned i = (Value & 0x800) >> 11; + unsigned Mid3 = (Value & 0x700) >> 8; + unsigned Lo8 = Value & 0x0FF; + // inst{19-16} = Hi4; + // inst{26} = i; + // inst{14-12} = Mid3; + // inst{7-0} = Lo8; + Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8); + + uint64_t swapped = (Value & 0xFFFF0000) >> 16; + swapped |= (Value & 0x0000FFFF) << 16; + return swapped; + } + case ARM::fixup_arm_ldst_pcrel_12: + // ARM PC-relative values are offset by 8. + Value -= 4; + // FALLTHROUGH + case ARM::fixup_t2_ldst_pcrel_12: { + // Offset by 4, adjusted by two due to the half-word ordering of thumb. + Value -= 4; + bool isAdd = true; + if ((int64_t)Value < 0) { + Value = -Value; + isAdd = false; + } + assert ((Value < 4096) && "Out of range pc-relative fixup value!"); + Value |= isAdd << 23; + + // Same addressing mode as fixup_arm_pcrel_10, + // but with 16-bit halfwords swapped. + if (Kind == ARM::fixup_t2_ldst_pcrel_12) { + uint64_t swapped = (Value & 0xFFFF0000) >> 16; + swapped |= (Value & 0x0000FFFF) << 16; + return swapped; + } + + return Value; + } + case ARM::fixup_thumb_adr_pcrel_10: + return ((Value - 4) >> 2) & 0xff; + case ARM::fixup_arm_adr_pcrel_12: { + // ARM PC-relative values are offset by 8. + Value -= 8; + unsigned opc = 4; // bits {24-21}. Default to add: 0b0100 + if ((int64_t)Value < 0) { + Value = -Value; + opc = 2; // 0b0010 + } + assert(ARM_AM::getSOImmVal(Value) != -1 && + "Out of range pc-relative fixup value!"); + // Encode the immediate and shift the opcode into place. + return ARM_AM::getSOImmVal(Value) | (opc << 21); + } + + case ARM::fixup_t2_adr_pcrel_12: { + Value -= 4; + unsigned opc = 0; + if ((int64_t)Value < 0) { + Value = -Value; + opc = 5; + } + + uint32_t out = (opc << 21); + out |= (Value & 0x800) << 14; + out |= (Value & 0x700) << 4; + out |= (Value & 0x0FF); + + uint64_t swapped = (out & 0xFFFF0000) >> 16; + swapped |= (out & 0x0000FFFF) << 16; + return swapped; + } + + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + // These values don't encode the low two bits since they're always zero. + // Offset by 8 just as above. + return 0xffffff & ((Value - 8) >> 2); + case ARM::fixup_t2_uncondbranch: { + Value = Value - 4; + Value >>= 1; // Low bit is not encoded. + + uint32_t out = 0; + bool I = Value & 0x800000; + bool J1 = Value & 0x400000; + bool J2 = Value & 0x200000; + J1 ^= I; + J2 ^= I; + + out |= I << 26; // S bit + out |= !J1 << 13; // J1 bit + out |= !J2 << 11; // J2 bit + out |= (Value & 0x1FF800) << 5; // imm6 field + out |= (Value & 0x0007FF); // imm11 field + + uint64_t swapped = (out & 0xFFFF0000) >> 16; + swapped |= (out & 0x0000FFFF) << 16; + return swapped; + } + case ARM::fixup_t2_condbranch: { + Value = Value - 4; + Value >>= 1; // Low bit is not encoded. + + uint64_t out = 0; + out |= (Value & 0x80000) << 7; // S bit + out |= (Value & 0x40000) >> 7; // J2 bit + out |= (Value & 0x20000) >> 4; // J1 bit + out |= (Value & 0x1F800) << 5; // imm6 field + out |= (Value & 0x007FF); // imm11 field + + uint32_t swapped = (out & 0xFFFF0000) >> 16; + swapped |= (out & 0x0000FFFF) << 16; + return swapped; + } + case ARM::fixup_arm_thumb_bl: { + // The value doesn't encode the low bit (always zero) and is offset by + // four. The value is encoded into disjoint bit positions in the destination + // opcode. x = unchanged, I = immediate value bit, S = sign extension bit + // + // BL: xxxxxSIIIIIIIIII xxxxxIIIIIIIIIII + // + // Note that the halfwords are stored high first, low second; so we need + // to transpose the fixup value here to map properly. + unsigned isNeg = (int64_t(Value) < 0) ? 1 : 0; + uint32_t Binary = 0; + Value = 0x3fffff & ((Value - 4) >> 1); + Binary = (Value & 0x7ff) << 16; // Low imm11 value. + Binary |= (Value & 0x1ffc00) >> 11; // High imm10 value. + Binary |= isNeg << 10; // Sign bit. + return Binary; + } + case ARM::fixup_arm_thumb_blx: { + // The value doesn't encode the low two bits (always zero) and is offset by + // four (see fixup_arm_thumb_cp). The value is encoded into disjoint bit + // positions in the destination opcode. x = unchanged, I = immediate value + // bit, S = sign extension bit, 0 = zero. + // + // BLX: xxxxxSIIIIIIIIII xxxxxIIIIIIIIII0 + // + // Note that the halfwords are stored high first, low second; so we need + // to transpose the fixup value here to map properly. + unsigned isNeg = (int64_t(Value) < 0) ? 1 : 0; + uint32_t Binary = 0; + Value = 0xfffff & ((Value - 2) >> 2); + Binary = (Value & 0x3ff) << 17; // Low imm10L value. + Binary |= (Value & 0xffc00) >> 10; // High imm10H value. + Binary |= isNeg << 10; // Sign bit. + return Binary; + } + case ARM::fixup_arm_thumb_cp: + // Offset by 4, and don't encode the low two bits. Two bytes of that + // 'off by 4' is implicitly handled by the half-word ordering of the + // Thumb encoding, so we only need to adjust by 2 here. + return ((Value - 2) >> 2) & 0xff; + case ARM::fixup_arm_thumb_cb: { + // Offset by 4 and don't encode the lower bit, which is always 0. + uint32_t Binary = (Value - 4) >> 1; + return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3); + } + case ARM::fixup_arm_thumb_br: + // Offset by 4 and don't encode the lower bit, which is always 0. + return ((Value - 4) >> 1) & 0x7ff; + case ARM::fixup_arm_thumb_bcc: + // Offset by 4 and don't encode the lower bit, which is always 0. + return ((Value - 4) >> 1) & 0xff; + case ARM::fixup_arm_pcrel_10: + Value = Value - 4; // ARM fixups offset by an additional word and don't + // need to adjust for the half-word ordering. + // Fall through. + case ARM::fixup_t2_pcrel_10: { + // Offset by 4, adjusted by two due to the half-word ordering of thumb. + Value = Value - 4; + bool isAdd = true; + if ((int64_t)Value < 0) { + Value = -Value; + isAdd = false; + } + // These values don't encode the low two bits since they're always zero. + Value >>= 2; + assert ((Value < 256) && "Out of range pc-relative fixup value!"); + Value |= isAdd << 23; + + // Same addressing mode as fixup_arm_pcrel_10, + // but with 16-bit halfwords swapped. + if (Kind == ARM::fixup_t2_pcrel_10) { + uint32_t swapped = (Value & 0xFFFF0000) >> 16; + swapped |= (Value & 0x0000FFFF) << 16; + return swapped; + } + + return Value; + } + } +} + +namespace { + +// FIXME: This should be in a separate file. +// ELF is an ELF of course... +class ELFARMAsmBackend : public ARMAsmBackend { +public: + Triple::OSType OSType; + ELFARMAsmBackend(const Target &T, Triple::OSType _OSType) + : ARMAsmBackend(T), OSType(_OSType) { } + + void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value) const; + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createELFObjectWriter(new ARMELFObjectWriter(OSType), OS, + /*IsLittleEndian*/ true); + } +}; + +// FIXME: Raise this to share code between Darwin and ELF. +void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value) const { + unsigned NumBytes = 4; // FIXME: 2 for Thumb + Value = adjustFixupValue(Fixup.getKind(), Value); + if (!Value) return; // Doesn't change encoding. + + unsigned Offset = Fixup.getOffset(); + assert(Offset % NumBytes == 0 && "Offset mod NumBytes is nonzero!"); + + // For each byte of the fragment that the fixup touches, mask in the bits from + // the fixup value. The Value has been "split up" into the appropriate + // bitfields above. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +// FIXME: This should be in a separate file. +class DarwinARMAsmBackend : public ARMAsmBackend { +public: + DarwinARMAsmBackend(const Target &T) : ARMAsmBackend(T) { } + + void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value) const; + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + // FIXME: Subtarget info should be derived. Force v7 for now. + return createMachObjectWriter(new ARMMachObjectWriter( + /*Is64Bit=*/false, + object::mach::CTM_ARM, + object::mach::CSARM_V7), + OS, + /*IsLittleEndian=*/true); + } + + virtual bool doesSectionRequireSymbols(const MCSection &Section) const { + return false; + } +}; + +/// getFixupKindNumBytes - The number of bytes the fixup may change. +static unsigned getFixupKindNumBytes(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + + case FK_Data_1: + case ARM::fixup_arm_thumb_bcc: + case ARM::fixup_arm_thumb_cp: + case ARM::fixup_thumb_adr_pcrel_10: + return 1; + + case FK_Data_2: + case ARM::fixup_arm_thumb_br: + case ARM::fixup_arm_thumb_cb: + return 2; + + case ARM::fixup_arm_ldst_pcrel_12: + case ARM::fixup_arm_pcrel_10: + case ARM::fixup_arm_adr_pcrel_12: + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + return 3; + + case FK_Data_4: + case ARM::fixup_t2_ldst_pcrel_12: + case ARM::fixup_t2_condbranch: + case ARM::fixup_t2_uncondbranch: + case ARM::fixup_t2_pcrel_10: + case ARM::fixup_t2_adr_pcrel_12: + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_blx: + case ARM::fixup_arm_movt_hi16: + case ARM::fixup_arm_movw_lo16: + case ARM::fixup_arm_movt_hi16_pcrel: + case ARM::fixup_arm_movw_lo16_pcrel: + case ARM::fixup_t2_movt_hi16: + case ARM::fixup_t2_movw_lo16: + case ARM::fixup_t2_movt_hi16_pcrel: + case ARM::fixup_t2_movw_lo16_pcrel: + return 4; + } +} + +void DarwinARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value) const { + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + Value = adjustFixupValue(Fixup.getKind(), Value); + if (!Value) return; // Doesn't change encoding. + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +} // end anonymous namespace + +TargetAsmBackend *llvm::createARMAsmBackend(const Target &T, + const std::string &TT) { + switch (Triple(TT).getOS()) { + case Triple::Darwin: + return new DarwinARMAsmBackend(T); + case Triple::MinGW32: + case Triple::Cygwin: + case Triple::Win32: + assert(0 && "Windows not supported on ARM"); + default: + return new ELFARMAsmBackend(T, Triple(TT).getOS()); + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 6cfd596..db12b8e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -14,28 +14,31 @@ #define DEBUG_TYPE "asm-printer" #include "ARM.h" -#include "ARMBuildAttrs.h" +#include "ARMAsmPrinter.h" #include "ARMAddressingModes.h" +#include "ARMBuildAttrs.h" +#include "ARMBaseRegisterInfo.h" #include "ARMConstantPoolValue.h" -#include "AsmPrinter/ARMInstPrinter.h" #include "ARMMachineFunctionInfo.h" -#include "ARMMCInstLower.h" +#include "ARMMCExpr.h" #include "ARMTargetMachine.h" +#include "ARMTargetObjectFile.h" +#include "InstPrinter/ARMInstPrinter.h" #include "llvm/Analysis/DebugInfo.h" #include "llvm/Constants.h" #include "llvm/Module.h" #include "llvm/Type.h" #include "llvm/Assembly/Writer.h" -#include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Target/Mangler.h" @@ -53,270 +56,127 @@ #include <cctype> using namespace llvm; -static cl::opt<bool> -EnableMCInst("enable-arm-mcinst-printer", cl::Hidden, - cl::desc("enable experimental asmprinter gunk in the arm backend")); - -namespace llvm { - namespace ARM { - enum DW_ISA { - DW_ISA_ARM_thumb = 1, - DW_ISA_ARM_arm = 2 - }; - } -} - namespace { - class ARMAsmPrinter : public AsmPrinter { - - /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can - /// make the right decision when printing asm code for different targets. - const ARMSubtarget *Subtarget; - /// AFI - Keep a pointer to ARMFunctionInfo for the current - /// MachineFunction. - ARMFunctionInfo *AFI; + // Per section and per symbol attributes are not supported. + // To implement them we would need the ability to delay this emission + // until the assembly file is fully parsed/generated as only then do we + // know the symbol and section numbers. + class AttributeEmitter { + public: + virtual void MaybeSwitchVendor(StringRef Vendor) = 0; + virtual void EmitAttribute(unsigned Attribute, unsigned Value) = 0; + virtual void EmitTextAttribute(unsigned Attribute, StringRef String) = 0; + virtual void Finish() = 0; + virtual ~AttributeEmitter() {} + }; - /// MCP - Keep a pointer to constantpool entries of the current - /// MachineFunction. - const MachineConstantPool *MCP; + class AsmAttributeEmitter : public AttributeEmitter { + MCStreamer &Streamer; public: - explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL) { - Subtarget = &TM.getSubtarget<ARMSubtarget>(); - } + AsmAttributeEmitter(MCStreamer &Streamer_) : Streamer(Streamer_) {} + void MaybeSwitchVendor(StringRef Vendor) { } - virtual const char *getPassName() const { - return "ARM Assembly Printer"; + void EmitAttribute(unsigned Attribute, unsigned Value) { + Streamer.EmitRawText("\t.eabi_attribute " + + Twine(Attribute) + ", " + Twine(Value)); } - void printInstructionThroughMCStreamer(const MachineInstr *MI); - - - void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, - const char *Modifier = 0); - void printSOImmOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); - void printSOImm2PartOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printSORegOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printAddrMode2Operand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printAddrMode2OffsetOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printAddrMode3Operand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printAddrMode3OffsetOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printAddrMode4Operand(const MachineInstr *MI, int OpNum,raw_ostream &O, - const char *Modifier = 0); - void printAddrMode5Operand(const MachineInstr *MI, int OpNum,raw_ostream &O, - const char *Modifier = 0); - void printAddrMode6Operand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printAddrMode6OffsetOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printAddrModePCOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O, - const char *Modifier = 0); - void printBitfieldInvMaskImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printMemBOption(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printShiftImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - - void printThumbS4ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printThumbITMask(const MachineInstr *MI, int OpNum, raw_ostream &O); - void printThumbAddrModeRROperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printThumbAddrModeRI5Operand(const MachineInstr *MI, int OpNum, - raw_ostream &O, - unsigned Scale); - void printThumbAddrModeS1Operand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printThumbAddrModeS2Operand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printThumbAddrModeS4Operand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printThumbAddrModeSPOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - - void printT2SOOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); - void printT2AddrModeImm12Operand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printT2AddrModeImm8Operand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printT2AddrModeImm8s4Operand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printT2AddrModeImm8OffsetOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printT2AddrModeImm8s4OffsetOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) {} - void printT2AddrModeSoRegOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - - void printCPSOptionOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) {} - void printMSRMaskOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) {} - void printNegZeroOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) {} - void printPredicateOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printMandatoryPredicateOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printSBitModifierOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printPCLabel(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printRegisterList(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printCPInstOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O, - const char *Modifier); - void printJTBlockOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printJT2BlockOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printTBAddrMode(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printNoHashImmediate(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printVFPf32ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printVFPf64ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - void printNEONModImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O); - - virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); - virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O); - - void printInstruction(const MachineInstr *MI, raw_ostream &O); // autogen - static const char *getRegisterName(unsigned RegNo); - - virtual void EmitInstruction(const MachineInstr *MI); - bool runOnMachineFunction(MachineFunction &F); - - virtual void EmitConstantPool() {} // we emit constant pools customly! - virtual void EmitFunctionEntryLabel(); - void EmitStartOfAsmFile(Module &M); - void EmitEndOfAsmFile(Module &M); - - MachineLocation getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert (MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + void EmitTextAttribute(unsigned Attribute, StringRef String) { + switch (Attribute) { + case ARMBuildAttrs::CPU_name: + Streamer.EmitRawText(StringRef("\t.cpu ") + LowercaseString(String)); + break; + default: assert(0 && "Unsupported Text attribute in ASM Mode"); break; } - return Location; + } + void Finish() { } + }; + + class ObjectAttributeEmitter : public AttributeEmitter { + MCObjectStreamer &Streamer; + StringRef CurrentVendor; + SmallString<64> Contents; + + public: + ObjectAttributeEmitter(MCObjectStreamer &Streamer_) : + Streamer(Streamer_), CurrentVendor("") { } + + void MaybeSwitchVendor(StringRef Vendor) { + assert(!Vendor.empty() && "Vendor cannot be empty."); + + if (CurrentVendor.empty()) + CurrentVendor = Vendor; + else if (CurrentVendor == Vendor) + return; + else + Finish(); + + CurrentVendor = Vendor; + + assert(Contents.size() == 0); } - virtual unsigned getISAEncoding() { - // ARM/Darwin adds ISA to the DWARF info for each function. - if (!Subtarget->isTargetDarwin()) - return 0; - return Subtarget->isThumb() ? - llvm::ARM::DW_ISA_ARM_thumb : llvm::ARM::DW_ISA_ARM_arm; + void EmitAttribute(unsigned Attribute, unsigned Value) { + // FIXME: should be ULEB + Contents += Attribute; + Contents += Value; } - MCSymbol *GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, - const MachineBasicBlock *MBB) const; - MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const; - - /// EmitMachineConstantPoolValue - Print a machine constantpool value to - /// the .s file. - virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { - SmallString<128> Str; - raw_svector_ostream OS(Str); - EmitMachineConstantPoolValue(MCPV, OS); - OutStreamer.EmitRawText(OS.str()); + void EmitTextAttribute(unsigned Attribute, StringRef String) { + Contents += Attribute; + Contents += UppercaseString(String); + Contents += 0; } - void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV, - raw_ostream &O) { - switch (TM.getTargetData()->getTypeAllocSize(MCPV->getType())) { - case 1: O << MAI->getData8bitsDirective(0); break; - case 2: O << MAI->getData16bitsDirective(0); break; - case 4: O << MAI->getData32bitsDirective(0); break; - default: assert(0 && "Unknown CPV size"); - } + void Finish() { + const size_t ContentsSize = Contents.size(); - ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV); - - if (ACPV->isLSDA()) { - O << MAI->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber(); - } else if (ACPV->isBlockAddress()) { - O << *GetBlockAddressSymbol(ACPV->getBlockAddress()); - } else if (ACPV->isGlobalValue()) { - const GlobalValue *GV = ACPV->getGV(); - bool isIndirect = Subtarget->isTargetDarwin() && - Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()); - if (!isIndirect) - O << *Mang->getSymbol(GV); - else { - // FIXME: Remove this when Darwin transition to @GOT like syntax. - MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); - O << *Sym; - - MachineModuleInfoMachO &MMIMachO = - MMI->getObjFileInfo<MachineModuleInfoMachO>(); - MachineModuleInfoImpl::StubValueTy &StubSym = - GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(Sym) : - MMIMachO.getGVStubEntry(Sym); - if (StubSym.getPointer() == 0) - StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); - } - } else { - assert(ACPV->isExtSymbol() && "unrecognized constant pool value"); - O << *GetExternalSymbolSymbol(ACPV->getSymbol()); - } + // Vendor size + Vendor name + '\0' + const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1; - if (ACPV->hasModifier()) O << "(" << ACPV->getModifier() << ")"; - if (ACPV->getPCAdjustment() != 0) { - O << "-(" << MAI->getPrivateGlobalPrefix() << "PC" - << getFunctionNumber() << "_" << ACPV->getLabelId() - << "+" << (unsigned)ACPV->getPCAdjustment(); - if (ACPV->mustAddCurrentAddress()) - O << "-."; - O << ')'; - } + // Tag + Tag Size + const size_t TagHeaderSize = 1 + 4; + + Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4); + Streamer.EmitBytes(CurrentVendor, 0); + Streamer.EmitIntValue(0, 1); // '\0' + + Streamer.EmitIntValue(ARMBuildAttrs::File, 1); + Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4); + + Streamer.EmitBytes(Contents, 0); + + Contents.clear(); } }; + } // end of anonymous namespace -#include "ARMGenAsmWriter.inc" +MachineLocation ARMAsmPrinter:: +getDebugValueLocation(const MachineInstr *MI) const { + MachineLocation Location; + assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); + // Frame address. Currently handles register +- offset only. + if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; +} void ARMAsmPrinter::EmitFunctionEntryLabel() { if (AFI->isThumbFunction()) { - OutStreamer.EmitRawText(StringRef("\t.code\t16")); - if (!Subtarget->isTargetDarwin()) - OutStreamer.EmitRawText(StringRef("\t.thumb_func")); - else { - // This needs to emit to a temporary string to get properly quoted - // MCSymbols when they have spaces in them. - SmallString<128> Tmp; - raw_svector_ostream OS(Tmp); - OS << "\t.thumb_func\t" << *CurrentFnSym; - OutStreamer.EmitRawText(OS.str()); - } + OutStreamer.EmitAssemblerFlag(MCAF_Code16); + OutStreamer.EmitThumbFunc(Subtarget->isTargetDarwin()? CurrentFnSym : 0); } OutStreamer.EmitLabel(CurrentFnSym); } -/// runOnMachineFunction - This uses the printInstruction() +/// runOnMachineFunction - This uses the EmitInstruction() /// method to print assembly for each instruction. /// bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { @@ -337,32 +197,18 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, case MachineOperand::MO_Register: { unsigned Reg = MO.getReg(); assert(TargetRegisterInfo::isPhysicalRegister(Reg)); - if (Modifier && strcmp(Modifier, "dregpair") == 0) { - unsigned DRegLo = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_0); - unsigned DRegHi = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_1); - O << '{' - << getRegisterName(DRegLo) << ", " << getRegisterName(DRegHi) - << '}'; - } else if (Modifier && strcmp(Modifier, "lane") == 0) { - unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg); - unsigned DReg = - TM.getRegisterInfo()->getMatchingSuperReg(Reg, - RegNum & 1 ? ARM::ssub_1 : ARM::ssub_0, &ARM::DPR_VFP2RegClass); - O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']'; - } else { - assert(!MO.getSubReg() && "Subregs should be eliminated!"); - O << getRegisterName(Reg); - } + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + O << ARMInstPrinter::getRegisterName(Reg); break; } case MachineOperand::MO_Immediate: { int64_t Imm = MO.getImm(); O << '#'; if ((Modifier && strcmp(Modifier, "lo16") == 0) || - (TF & ARMII::MO_LO16)) + (TF == ARMII::MO_LO16)) O << ":lower16:"; else if ((Modifier && strcmp(Modifier, "hi16") == 0) || - (TF & ARMII::MO_HI16)) + (TF == ARMII::MO_HI16)) O << ":upper16:"; O << Imm; break; @@ -371,9 +217,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, O << *MO.getMBB()->getSymbol(); return; case MachineOperand::MO_GlobalAddress: { - bool isCallOp = Modifier && !strcmp(Modifier, "call"); const GlobalValue *GV = MO.getGlobal(); - if ((Modifier && strcmp(Modifier, "lo16") == 0) || (TF & ARMII::MO_LO16)) O << ":lower16:"; @@ -383,18 +227,13 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, O << *Mang->getSymbol(GV); printOffset(MO.getOffset(), O); - - if (isCallOp && Subtarget->isTargetELF() && - TM.getRelocationModel() == Reloc::PIC_) + if (TF == ARMII::MO_PLT) O << "(PLT)"; break; } case MachineOperand::MO_ExternalSymbol: { - bool isCallOp = Modifier && !strcmp(Modifier, "call"); O << *GetExternalSymbolSymbol(MO.getSymbolName()); - - if (isCallOp && Subtarget->isTargetELF() && - TM.getRelocationModel() == Reloc::PIC_) + if (TF == ARMII::MO_PLT) O << "(PLT)"; break; } @@ -407,538 +246,8 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, } } -static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm, - const MCAsmInfo *MAI) { - // Break it up into two parts that make up a shifter immediate. - V = ARM_AM::getSOImmVal(V); - assert(V != -1 && "Not a valid so_imm value!"); - - unsigned Imm = ARM_AM::getSOImmValImm(V); - unsigned Rot = ARM_AM::getSOImmValRot(V); - - // Print low-level immediate formation info, per - // A5.1.3: "Data-processing operands - Immediate". - if (Rot) { - O << "#" << Imm << ", " << Rot; - // Pretty printed version. - if (VerboseAsm) { - O << "\t" << MAI->getCommentString() << ' '; - O << (int)ARM_AM::rotr32(Imm, Rot); - } - } else { - O << "#" << Imm; - } -} - -/// printSOImmOperand - SOImm is 4-bit rotate amount in bits 8-11 with 8-bit -/// immediate in bits 0-7. -void ARMAsmPrinter::printSOImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNum); - assert(MO.isImm() && "Not a valid so_imm value!"); - printSOImm(O, MO.getImm(), isVerbose(), MAI); -} - -/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov' -/// followed by an 'orr' to materialize. -void ARMAsmPrinter::printSOImm2PartOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNum); - assert(MO.isImm() && "Not a valid so_imm value!"); - unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO.getImm()); - unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO.getImm()); - printSOImm(O, V1, isVerbose(), MAI); - O << "\n\torr"; - printPredicateOperand(MI, 2, O); - O << "\t"; - printOperand(MI, 0, O); - O << ", "; - printOperand(MI, 0, O); - O << ", "; - printSOImm(O, V2, isVerbose(), MAI); -} - -// so_reg is a 4-operand unit corresponding to register forms of the A5.1 -// "Addressing Mode 1 - Data-processing operands" forms. This includes: -// REG 0 0 - e.g. R5 -// REG REG 0,SH_OPC - e.g. R5, ROR R3 -// REG 0 IMM,SH_OPC - e.g. R5, LSL #3 -void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(Op); - const MachineOperand &MO2 = MI->getOperand(Op+1); - const MachineOperand &MO3 = MI->getOperand(Op+2); - - O << getRegisterName(MO1.getReg()); - - // Print the shift opc. - ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm()); - O << ", " << ARM_AM::getShiftOpcStr(ShOpc); - if (MO2.getReg()) { - O << ' ' << getRegisterName(MO2.getReg()); - assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); - } else if (ShOpc != ARM_AM::rrx) { - O << " #" << ARM_AM::getSORegOffset(MO3.getImm()); - } -} - -void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(Op); - const MachineOperand &MO2 = MI->getOperand(Op+1); - const MachineOperand &MO3 = MI->getOperand(Op+2); - - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. - printOperand(MI, Op, O); - return; - } - - O << "[" << getRegisterName(MO1.getReg()); - - if (!MO2.getReg()) { - if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0. - O << ", #" - << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) - << ARM_AM::getAM2Offset(MO3.getImm()); - O << "]"; - return; - } - - O << ", " - << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) - << getRegisterName(MO2.getReg()); - - if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm())) - O << ", " - << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm())) - << " #" << ShImm; - O << "]"; -} - -void ARMAsmPrinter::printAddrMode2OffsetOperand(const MachineInstr *MI, int Op, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(Op); - const MachineOperand &MO2 = MI->getOperand(Op+1); - - if (!MO1.getReg()) { - unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm()); - O << "#" - << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) - << ImmOffs; - return; - } - - O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) - << getRegisterName(MO1.getReg()); - - if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm())) - O << ", " - << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm())) - << " #" << ShImm; -} - -void ARMAsmPrinter::printAddrMode3Operand(const MachineInstr *MI, int Op, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(Op); - const MachineOperand &MO2 = MI->getOperand(Op+1); - const MachineOperand &MO3 = MI->getOperand(Op+2); - - assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); - O << "[" << getRegisterName(MO1.getReg()); - - if (MO2.getReg()) { - O << ", " - << (char)ARM_AM::getAM3Op(MO3.getImm()) - << getRegisterName(MO2.getReg()) - << "]"; - return; - } - - if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm())) - O << ", #" - << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) - << ImmOffs; - O << "]"; -} - -void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op, - raw_ostream &O){ - const MachineOperand &MO1 = MI->getOperand(Op); - const MachineOperand &MO2 = MI->getOperand(Op+1); - - if (MO1.getReg()) { - O << (char)ARM_AM::getAM3Op(MO2.getImm()) - << getRegisterName(MO1.getReg()); - return; - } - - unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm()); - O << "#" - << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) - << ImmOffs; -} - -void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op, - raw_ostream &O, - const char *Modifier) { - const MachineOperand &MO2 = MI->getOperand(Op+1); - ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); - if (Modifier && strcmp(Modifier, "submode") == 0) { - O << ARM_AM::getAMSubModeStr(Mode); - } else if (Modifier && strcmp(Modifier, "wide") == 0) { - ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); - if (Mode == ARM_AM::ia) - O << ".w"; - } else { - printOperand(MI, Op, O); - } -} - -void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op, - raw_ostream &O, - const char *Modifier) { - const MachineOperand &MO1 = MI->getOperand(Op); - const MachineOperand &MO2 = MI->getOperand(Op+1); - - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. - printOperand(MI, Op, O); - return; - } - - assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); - - O << "[" << getRegisterName(MO1.getReg()); - - if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) { - O << ", #" - << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm())) - << ImmOffs*4; - } - O << "]"; -} - -void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(Op); - const MachineOperand &MO2 = MI->getOperand(Op+1); - - O << "[" << getRegisterName(MO1.getReg()); - if (MO2.getImm()) { - // FIXME: Both darwin as and GNU as violate ARM docs here. - O << ", :" << (MO2.getImm() << 3); - } - O << "]"; -} - -void ARMAsmPrinter::printAddrMode6OffsetOperand(const MachineInstr *MI, int Op, - raw_ostream &O){ - const MachineOperand &MO = MI->getOperand(Op); - if (MO.getReg() == 0) - O << "!"; - else - O << ", " << getRegisterName(MO.getReg()); -} - -void ARMAsmPrinter::printAddrModePCOperand(const MachineInstr *MI, int Op, - raw_ostream &O, - const char *Modifier) { - if (Modifier && strcmp(Modifier, "label") == 0) { - printPCLabel(MI, Op+1, O); - return; - } - - const MachineOperand &MO1 = MI->getOperand(Op); - assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); - O << "[pc, " << getRegisterName(MO1.getReg()) << "]"; -} - -void -ARMAsmPrinter::printBitfieldInvMaskImmOperand(const MachineInstr *MI, int Op, - raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(Op); - uint32_t v = ~MO.getImm(); - int32_t lsb = CountTrailingZeros_32(v); - int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb; - assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!"); - O << "#" << lsb << ", #" << width; -} - -void -ARMAsmPrinter::printMemBOption(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - unsigned val = MI->getOperand(OpNum).getImm(); - O << ARM_MB::MemBOptToString(val); -} - -void ARMAsmPrinter::printShiftImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - unsigned ShiftOp = MI->getOperand(OpNum).getImm(); - ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp); - switch (Opc) { - case ARM_AM::no_shift: - return; - case ARM_AM::lsl: - O << ", lsl #"; - break; - case ARM_AM::asr: - O << ", asr #"; - break; - default: - assert(0 && "unexpected shift opcode for shift immediate operand"); - } - O << ARM_AM::getSORegOffset(ShiftOp); -} - -//===--------------------------------------------------------------------===// - -void ARMAsmPrinter::printThumbS4ImmOperand(const MachineInstr *MI, int Op, - raw_ostream &O) { - O << "#" << MI->getOperand(Op).getImm() * 4; -} - -void -ARMAsmPrinter::printThumbITMask(const MachineInstr *MI, int Op, - raw_ostream &O) { - // (3 - the number of trailing zeros) is the number of then / else. - unsigned Mask = MI->getOperand(Op).getImm(); - unsigned CondBit0 = Mask >> 4 & 1; - unsigned NumTZ = CountTrailingZeros_32(Mask); - assert(NumTZ <= 3 && "Invalid IT mask!"); - for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { - bool T = ((Mask >> Pos) & 1) == CondBit0; - if (T) - O << 't'; - else - O << 'e'; - } -} - -void -ARMAsmPrinter::printThumbAddrModeRROperand(const MachineInstr *MI, int Op, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(Op); - const MachineOperand &MO2 = MI->getOperand(Op+1); - O << "[" << getRegisterName(MO1.getReg()); - O << ", " << getRegisterName(MO2.getReg()) << "]"; -} - -void -ARMAsmPrinter::printThumbAddrModeRI5Operand(const MachineInstr *MI, int Op, - raw_ostream &O, - unsigned Scale) { - const MachineOperand &MO1 = MI->getOperand(Op); - const MachineOperand &MO2 = MI->getOperand(Op+1); - const MachineOperand &MO3 = MI->getOperand(Op+2); - - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. - printOperand(MI, Op, O); - return; - } - - O << "[" << getRegisterName(MO1.getReg()); - if (MO3.getReg()) - O << ", " << getRegisterName(MO3.getReg()); - else if (unsigned ImmOffs = MO2.getImm()) - O << ", #" << ImmOffs * Scale; - O << "]"; -} - -void -ARMAsmPrinter::printThumbAddrModeS1Operand(const MachineInstr *MI, int Op, - raw_ostream &O) { - printThumbAddrModeRI5Operand(MI, Op, O, 1); -} -void -ARMAsmPrinter::printThumbAddrModeS2Operand(const MachineInstr *MI, int Op, - raw_ostream &O) { - printThumbAddrModeRI5Operand(MI, Op, O, 2); -} -void -ARMAsmPrinter::printThumbAddrModeS4Operand(const MachineInstr *MI, int Op, - raw_ostream &O) { - printThumbAddrModeRI5Operand(MI, Op, O, 4); -} - -void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(Op); - const MachineOperand &MO2 = MI->getOperand(Op+1); - O << "[" << getRegisterName(MO1.getReg()); - if (unsigned ImmOffs = MO2.getImm()) - O << ", #" << ImmOffs*4; - O << "]"; -} - -//===--------------------------------------------------------------------===// - -// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2 -// register with shift forms. -// REG 0 0 - e.g. R5 -// REG IMM, SH_OPC - e.g. R5, LSL #3 -void ARMAsmPrinter::printT2SOOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(OpNum); - const MachineOperand &MO2 = MI->getOperand(OpNum+1); - - unsigned Reg = MO1.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); - O << getRegisterName(Reg); - - // Print the shift opc. - assert(MO2.isImm() && "Not a valid t2_so_reg value!"); - ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm()); - O << ", " << ARM_AM::getShiftOpcStr(ShOpc); - if (ShOpc != ARM_AM::rrx) - O << " #" << ARM_AM::getSORegOffset(MO2.getImm()); -} - -void ARMAsmPrinter::printT2AddrModeImm12Operand(const MachineInstr *MI, - int OpNum, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(OpNum); - const MachineOperand &MO2 = MI->getOperand(OpNum+1); - - O << "[" << getRegisterName(MO1.getReg()); - - unsigned OffImm = MO2.getImm(); - if (OffImm) // Don't print +0. - O << ", #" << OffImm; - O << "]"; -} - -void ARMAsmPrinter::printT2AddrModeImm8Operand(const MachineInstr *MI, - int OpNum, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(OpNum); - const MachineOperand &MO2 = MI->getOperand(OpNum+1); - - O << "[" << getRegisterName(MO1.getReg()); - - int32_t OffImm = (int32_t)MO2.getImm(); - // Don't print +0. - if (OffImm < 0) - O << ", #-" << -OffImm; - else if (OffImm > 0) - O << ", #" << OffImm; - O << "]"; -} - -void ARMAsmPrinter::printT2AddrModeImm8s4Operand(const MachineInstr *MI, - int OpNum, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(OpNum); - const MachineOperand &MO2 = MI->getOperand(OpNum+1); - - O << "[" << getRegisterName(MO1.getReg()); - - int32_t OffImm = (int32_t)MO2.getImm() / 4; - // Don't print +0. - if (OffImm < 0) - O << ", #-" << -OffImm * 4; - else if (OffImm > 0) - O << ", #" << OffImm * 4; - O << "]"; -} - -void ARMAsmPrinter::printT2AddrModeImm8OffsetOperand(const MachineInstr *MI, - int OpNum, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(OpNum); - int32_t OffImm = (int32_t)MO1.getImm(); - // Don't print +0. - if (OffImm < 0) - O << "#-" << -OffImm; - else if (OffImm > 0) - O << "#" << OffImm; -} - -void ARMAsmPrinter::printT2AddrModeSoRegOperand(const MachineInstr *MI, - int OpNum, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(OpNum); - const MachineOperand &MO2 = MI->getOperand(OpNum+1); - const MachineOperand &MO3 = MI->getOperand(OpNum+2); - - O << "[" << getRegisterName(MO1.getReg()); - - assert(MO2.getReg() && "Invalid so_reg load / store address!"); - O << ", " << getRegisterName(MO2.getReg()); - - unsigned ShAmt = MO3.getImm(); - if (ShAmt) { - assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!"); - O << ", lsl #" << ShAmt; - } - O << "]"; -} - - //===--------------------------------------------------------------------===// -void ARMAsmPrinter::printPredicateOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); - if (CC != ARMCC::AL) - O << ARMCondCodeToString(CC); -} - -void ARMAsmPrinter::printMandatoryPredicateOperand(const MachineInstr *MI, - int OpNum, - raw_ostream &O) { - ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); - O << ARMCondCodeToString(CC); -} - -void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O){ - unsigned Reg = MI->getOperand(OpNum).getReg(); - if (Reg) { - assert(Reg == ARM::CPSR && "Expect ARM CPSR register!"); - O << 's'; - } -} - -void ARMAsmPrinter::printPCLabel(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - int Id = (int)MI->getOperand(OpNum).getImm(); - O << MAI->getPrivateGlobalPrefix() - << "PC" << getFunctionNumber() << "_" << Id; -} - -void ARMAsmPrinter::printRegisterList(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "{"; - for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) { - if (MI->getOperand(i).isImplicit()) - continue; - if ((int)i != OpNum) O << ", "; - printOperand(MI, i, O); - } - O << "}"; -} - -void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O, const char *Modifier) { - assert(Modifier && "This operand only works with a modifier!"); - // There are two aspects to a CONSTANTPOOL_ENTRY operand, the label and the - // data itself. - if (!strcmp(Modifier, "label")) { - unsigned ID = MI->getOperand(OpNum).getImm(); - OutStreamer.EmitLabel(GetCPISymbol(ID)); - } else { - assert(!strcmp(Modifier, "cpentry") && "Unknown modifier for CPE"); - unsigned CPI = MI->getOperand(OpNum).getIndex(); - - const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI]; - - if (MCPE.isMachineConstantPoolEntry()) { - EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); - } else { - EmitGlobalConstant(MCPE.Val.ConstVal); - } - } -} - MCSymbol *ARMAsmPrinter:: GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, const MachineBasicBlock *MBB) const { @@ -957,126 +266,12 @@ GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const { return OutContext.GetOrCreateSymbol(Name.str()); } -void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - assert(!Subtarget->isThumb2() && "Thumb2 should use double-jump jumptables!"); - - const MachineOperand &MO1 = MI->getOperand(OpNum); - const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id - - unsigned JTI = MO1.getIndex(); - MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); - // Can't use EmitLabel until instprinter happens, label comes out in the wrong - // order. - O << "\n" << *JTISymbol << ":\n"; - - const char *JTEntryDirective = MAI->getData32bitsDirective(); - - const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); - const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); - const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; - bool UseSet= MAI->hasSetDirective() && TM.getRelocationModel() == Reloc::PIC_; - SmallPtrSet<MachineBasicBlock*, 8> JTSets; - for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { - MachineBasicBlock *MBB = JTBBs[i]; - bool isNew = JTSets.insert(MBB); - - if (UseSet && isNew) { - O << "\t.set\t" - << *GetARMSetPICJumpTableLabel2(JTI, MO2.getImm(), MBB) << ',' - << *MBB->getSymbol() << '-' << *JTISymbol << '\n'; - } - - O << JTEntryDirective << ' '; - if (UseSet) - O << *GetARMSetPICJumpTableLabel2(JTI, MO2.getImm(), MBB); - else if (TM.getRelocationModel() == Reloc::PIC_) - O << *MBB->getSymbol() << '-' << *JTISymbol; - else - O << *MBB->getSymbol(); - - if (i != e-1) - O << '\n'; - } -} - -void ARMAsmPrinter::printJT2BlockOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(OpNum); - const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id - unsigned JTI = MO1.getIndex(); - - MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); - - // Can't use EmitLabel until instprinter happens, label comes out in the wrong - // order. - O << "\n" << *JTISymbol << ":\n"; - - const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); - const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); - const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; - bool ByteOffset = false, HalfWordOffset = false; - if (MI->getOpcode() == ARM::t2TBB) - ByteOffset = true; - else if (MI->getOpcode() == ARM::t2TBH) - HalfWordOffset = true; - - for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { - MachineBasicBlock *MBB = JTBBs[i]; - if (ByteOffset) - O << MAI->getData8bitsDirective(); - else if (HalfWordOffset) - O << MAI->getData16bitsDirective(); - - if (ByteOffset || HalfWordOffset) - O << '(' << *MBB->getSymbol() << "-" << *JTISymbol << ")/2"; - else - O << "\tb.w " << *MBB->getSymbol(); - - if (i != e-1) - O << '\n'; - } -} - -void ARMAsmPrinter::printTBAddrMode(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "[pc, " << getRegisterName(MI->getOperand(OpNum).getReg()); - if (MI->getOpcode() == ARM::t2TBH) - O << ", lsl #1"; - O << ']'; -} -void ARMAsmPrinter::printNoHashImmediate(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << MI->getOperand(OpNum).getImm(); -} - -void ARMAsmPrinter::printVFPf32ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - const ConstantFP *FP = MI->getOperand(OpNum).getFPImm(); - O << '#' << FP->getValueAPF().convertToFloat(); - if (isVerbose()) { - O << "\t\t" << MAI->getCommentString() << ' '; - WriteAsOperand(O, FP, /*PrintType=*/false); - } -} - -void ARMAsmPrinter::printVFPf64ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - const ConstantFP *FP = MI->getOperand(OpNum).getFPImm(); - O << '#' << FP->getValueAPF().convertToDouble(); - if (isVerbose()) { - O << "\t\t" << MAI->getCommentString() << ' '; - WriteAsOperand(O, FP, /*PrintType=*/false); - } -} - -void ARMAsmPrinter::printNEONModImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - unsigned EncodedImm = MI->getOperand(OpNum).getImm(); - unsigned EltBits; - uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); - O << "#0x" << utohexstr(Val); +MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel(void) const { + SmallString<60> Name; + raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "SJLJEH" + << getFunctionNumber(); + return OutContext.GetOrCreateSymbol(Name.str()); } bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, @@ -1090,14 +285,16 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, default: return true; // Unknown modifier. case 'a': // Print as a memory address. if (MI->getOperand(OpNum).isReg()) { - O << "[" << getRegisterName(MI->getOperand(OpNum).getReg()) << "]"; + O << "[" + << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg()) + << "]"; return false; } // Fallthrough case 'c': // Don't print "#" before an immediate operand. if (!MI->getOperand(OpNum).isImm()) return true; - printNoHashImmediate(MI, OpNum, O); + O << MI->getOperand(OpNum).getImm(); return false; case 'P': // Print a VFP double precision register. case 'q': // Print a NEON quad precision register. @@ -1106,7 +303,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, case 'Q': case 'R': case 'H': - report_fatal_error("llvm does not support 'Q', 'R', and 'H' modifiers!"); + // These modifiers are not yet supported. return true; } } @@ -1124,48 +321,10 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, const MachineOperand &MO = MI->getOperand(OpNum); assert(MO.isReg() && "unexpected inline asm memory operand"); - O << "[" << getRegisterName(MO.getReg()) << "]"; + O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]"; return false; } -void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { - if (EnableMCInst) { - printInstructionThroughMCStreamer(MI); - return; - } - - if (MI->getOpcode() == ARM::CONSTPOOL_ENTRY) - EmitAlignment(2); - - SmallString<128> Str; - raw_svector_ostream OS(Str); - if (MI->getOpcode() == ARM::DBG_VALUE) { - unsigned NOps = MI->getNumOperands(); - assert(NOps==4); - OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); - OS << V.getName(); - OS << " <- "; - // Frame address. Currently handles register +- offset only. - assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); - OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS); - OS << ']'; - OS << "+"; - printOperand(MI, NOps-2, OS); - OutStreamer.EmitRawText(OS.str()); - return; - } - - printInstruction(MI, OS); - OutStreamer.EmitRawText(OS.str()); - - // Make sure the instruction that follows TBB is 2-byte aligned. - // FIXME: Constant island pass should insert an "ALIGN" instruction instead. - if (MI->getOpcode() == ARM::t2TBB) - EmitAlignment(1); -} - void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { if (Subtarget->isTargetDarwin()) { Reloc::Model RelocM = TM.getRelocationModel(); @@ -1205,49 +364,12 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { } // Use unified assembler syntax. - OutStreamer.EmitRawText(StringRef("\t.syntax unified")); + OutStreamer.EmitAssemblerFlag(MCAF_SyntaxUnified); // Emit ARM Build Attributes if (Subtarget->isTargetELF()) { - // CPU Type - std::string CPUString = Subtarget->getCPUString(); - if (CPUString != "generic") - OutStreamer.EmitRawText("\t.cpu " + Twine(CPUString)); - - // FIXME: Emit FPU type - if (Subtarget->hasVFP2()) - OutStreamer.EmitRawText("\t.eabi_attribute " + - Twine(ARMBuildAttrs::VFP_arch) + ", 2"); - - // Signal various FP modes. - if (!UnsafeFPMath) { - OutStreamer.EmitRawText("\t.eabi_attribute " + - Twine(ARMBuildAttrs::ABI_FP_denormal) + ", 1"); - OutStreamer.EmitRawText("\t.eabi_attribute " + - Twine(ARMBuildAttrs::ABI_FP_exceptions) + ", 1"); - } - if (NoInfsFPMath && NoNaNsFPMath) - OutStreamer.EmitRawText("\t.eabi_attribute " + - Twine(ARMBuildAttrs::ABI_FP_number_model)+ ", 1"); - else - OutStreamer.EmitRawText("\t.eabi_attribute " + - Twine(ARMBuildAttrs::ABI_FP_number_model)+ ", 3"); - - // 8-bytes alignment stuff. - OutStreamer.EmitRawText("\t.eabi_attribute " + - Twine(ARMBuildAttrs::ABI_align8_needed) + ", 1"); - OutStreamer.EmitRawText("\t.eabi_attribute " + - Twine(ARMBuildAttrs::ABI_align8_preserved) + ", 1"); - - // Hard float. Use both S and D registers and conform to AAPCS-VFP. - if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard) { - OutStreamer.EmitRawText("\t.eabi_attribute " + - Twine(ARMBuildAttrs::ABI_HardFP_use) + ", 3"); - OutStreamer.EmitRawText("\t.eabi_attribute " + - Twine(ARMBuildAttrs::ABI_VFP_args) + ", 1"); - } - // FIXME: Should we signal R9 usage? + emitAttributes(); } } @@ -1280,10 +402,10 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { else // Internal to current translation unit. // - // When we place the LSDA into the TEXT section, the type info pointers - // need to be indirect and pc-rel. We accomplish this by using NLPs. - // However, sometimes the types are local to the file. So we need to - // fill in the value for the NLP in those cases. + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs; however, sometimes the types are local to the file. + // We need to fill in the value for the NLP in those cases. OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), 4/*size*/, 0/*addrspace*/); @@ -1321,38 +443,631 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { } //===----------------------------------------------------------------------===// +// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile() +// FIXME: +// The following seem like one-off assembler flags, but they actually need +// to appear in the .ARM.attributes section in ELF. +// Instead of subclassing the MCELFStreamer, we do the work here. + +void ARMAsmPrinter::emitAttributes() { + + emitARMAttributeSection(); + + AttributeEmitter *AttrEmitter; + if (OutStreamer.hasRawTextSupport()) + AttrEmitter = new AsmAttributeEmitter(OutStreamer); + else { + MCObjectStreamer &O = static_cast<MCObjectStreamer&>(OutStreamer); + AttrEmitter = new ObjectAttributeEmitter(O); + } + + AttrEmitter->MaybeSwitchVendor("aeabi"); + + std::string CPUString = Subtarget->getCPUString(); + + if (CPUString == "cortex-a8" || + Subtarget->isCortexA8()) { + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a8"); + AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7); + AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::ApplicationProfile); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use, + ARMBuildAttrs::Allowed); + AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::AllowThumb32); + // Fixme: figure out when this is emitted. + //AttrEmitter->EmitAttribute(ARMBuildAttrs::WMMX_arch, + // ARMBuildAttrs::AllowWMMXv1); + // + + /// ADD additional Else-cases here! + } else if (CPUString == "generic") { + // FIXME: Why these defaults? + AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use, + ARMBuildAttrs::Allowed); + AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::Allowed); + } -void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { - ARMMCInstLower MCInstLowering(OutContext, *Mang, *this); - switch (MI->getOpcode()) { - case ARM::t2MOVi32imm: - assert(0 && "Should be lowered by thumb2it pass"); + // FIXME: Emit FPU type + if (Subtarget->hasVFP2()) + AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, + ARMBuildAttrs::AllowFPv2); + + // Signal various FP modes. + if (!UnsafeFPMath) { + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_denormal, + ARMBuildAttrs::Allowed); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_exceptions, + ARMBuildAttrs::Allowed); + } + + if (NoInfsFPMath && NoNaNsFPMath) + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model, + ARMBuildAttrs::Allowed); + else + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model, + ARMBuildAttrs::AllowIEE754); + + // FIXME: add more flags to ARMBuildAttrs.h + // 8-bytes alignment stuff. + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_needed, 1); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1); + + // Hard float. Use both S and D registers and conform to AAPCS-VFP. + if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard) { + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_HardFP_use, 3); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_VFP_args, 1); + } + // FIXME: Should we signal R9 usage? + + if (Subtarget->hasDivide()) + AttrEmitter->EmitAttribute(ARMBuildAttrs::DIV_use, 1); + + AttrEmitter->Finish(); + delete AttrEmitter; +} + +void ARMAsmPrinter::emitARMAttributeSection() { + // <format-version> + // [ <section-length> "vendor-name" + // [ <file-tag> <size> <attribute>* + // | <section-tag> <size> <section-number>* 0 <attribute>* + // | <symbol-tag> <size> <symbol-number>* 0 <attribute>* + // ]+ + // ]* + + if (OutStreamer.hasRawTextSupport()) + return; + + const ARMElfTargetObjectFile &TLOFELF = + static_cast<const ARMElfTargetObjectFile &> + (getObjFileLowering()); + + OutStreamer.SwitchSection(TLOFELF.getAttributesSection()); + + // Format version + OutStreamer.EmitIntValue(0x41, 1); +} + +//===----------------------------------------------------------------------===// + +static MCSymbol *getPICLabel(const char *Prefix, unsigned FunctionNumber, + unsigned LabelId, MCContext &Ctx) { + + MCSymbol *Label = Ctx.GetOrCreateSymbol(Twine(Prefix) + + "PC" + Twine(FunctionNumber) + "_" + Twine(LabelId)); + return Label; +} + +static MCSymbolRefExpr::VariantKind +getModifierVariantKind(ARMCP::ARMCPModifier Modifier) { + switch (Modifier) { + default: llvm_unreachable("Unknown modifier!"); + case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None; + case ARMCP::TLSGD: return MCSymbolRefExpr::VK_ARM_TLSGD; + case ARMCP::TPOFF: return MCSymbolRefExpr::VK_ARM_TPOFF; + case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_ARM_GOTTPOFF; + case ARMCP::GOT: return MCSymbolRefExpr::VK_ARM_GOT; + case ARMCP::GOTOFF: return MCSymbolRefExpr::VK_ARM_GOTOFF; + } + return MCSymbolRefExpr::VK_None; +} + +MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) { + bool isIndirect = Subtarget->isTargetDarwin() && + Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()); + if (!isIndirect) + return Mang->getSymbol(GV); + + // FIXME: Remove this when Darwin transition to @GOT like syntax. + MCSymbol *MCSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoMachO &MMIMachO = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + MachineModuleInfoImpl::StubValueTy &StubSym = + GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym) : + MMIMachO.getGVStubEntry(MCSym); + if (StubSym.getPointer() == 0) + StubSym = MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + return MCSym; +} + +void ARMAsmPrinter:: +EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { + int Size = TM.getTargetData()->getTypeAllocSize(MCPV->getType()); + + ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV); + + MCSymbol *MCSym; + if (ACPV->isLSDA()) { + SmallString<128> Str; + raw_svector_ostream OS(Str); + OS << MAI->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber(); + MCSym = OutContext.GetOrCreateSymbol(OS.str()); + } else if (ACPV->isBlockAddress()) { + MCSym = GetBlockAddressSymbol(ACPV->getBlockAddress()); + } else if (ACPV->isGlobalValue()) { + const GlobalValue *GV = ACPV->getGV(); + MCSym = GetARMGVSymbol(GV); + } else { + assert(ACPV->isExtSymbol() && "unrecognized constant pool value"); + MCSym = GetExternalSymbolSymbol(ACPV->getSymbol()); + } + + // Create an MCSymbol for the reference. + const MCExpr *Expr = + MCSymbolRefExpr::Create(MCSym, getModifierVariantKind(ACPV->getModifier()), + OutContext); + + if (ACPV->getPCAdjustment()) { + MCSymbol *PCLabel = getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), + ACPV->getLabelId(), + OutContext); + const MCExpr *PCRelExpr = MCSymbolRefExpr::Create(PCLabel, OutContext); + PCRelExpr = + MCBinaryExpr::CreateAdd(PCRelExpr, + MCConstantExpr::Create(ACPV->getPCAdjustment(), + OutContext), + OutContext); + if (ACPV->mustAddCurrentAddress()) { + // We want "(<expr> - .)", but MC doesn't have a concept of the '.' + // label, so just emit a local label end reference that instead. + MCSymbol *DotSym = OutContext.CreateTempSymbol(); + OutStreamer.EmitLabel(DotSym); + const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext); + PCRelExpr = MCBinaryExpr::CreateSub(PCRelExpr, DotExpr, OutContext); + } + Expr = MCBinaryExpr::CreateSub(Expr, PCRelExpr, OutContext); + } + OutStreamer.EmitValue(Expr, Size); +} + +void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) { + unsigned Opcode = MI->getOpcode(); + int OpNum = 1; + if (Opcode == ARM::BR_JTadd) + OpNum = 2; + else if (Opcode == ARM::BR_JTm) + OpNum = 3; + + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id + unsigned JTI = MO1.getIndex(); + + // Emit a label for the jump table. + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); + OutStreamer.EmitLabel(JTISymbol); + + // Emit each entry of the table. + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + // Construct an MCExpr for the entry. We want a value of the form: + // (BasicBlockAddr - TableBeginAddr) + // + // For example, a table with entries jumping to basic blocks BB0 and BB1 + // would look like: + // LJTI_0_0: + // .word (LBB0 - LJTI_0_0) + // .word (LBB1 - LJTI_0_0) + const MCExpr *Expr = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext); + + if (TM.getRelocationModel() == Reloc::PIC_) + Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(JTISymbol, + OutContext), + OutContext); + OutStreamer.EmitValue(Expr, 4); + } +} + +void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { + unsigned Opcode = MI->getOpcode(); + int OpNum = (Opcode == ARM::t2BR_JT) ? 2 : 1; + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id + unsigned JTI = MO1.getIndex(); + + // Emit a label for the jump table. + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); + OutStreamer.EmitLabel(JTISymbol); + + // Emit each entry of the table. + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + unsigned OffsetWidth = 4; + if (MI->getOpcode() == ARM::t2TBB_JT) + OffsetWidth = 1; + else if (MI->getOpcode() == ARM::t2TBH_JT) + OffsetWidth = 2; + + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::Create(MBB->getSymbol(), + OutContext); + // If this isn't a TBB or TBH, the entries are direct branch instructions. + if (OffsetWidth == 4) { + MCInst BrInst; + BrInst.setOpcode(ARM::t2B); + BrInst.addOperand(MCOperand::CreateExpr(MBBSymbolExpr)); + OutStreamer.EmitInstruction(BrInst); + continue; + } + // Otherwise it's an offset from the dispatch instruction. Construct an + // MCExpr for the entry. We want a value of the form: + // (BasicBlockAddr - TableBeginAddr) / 2 + // + // For example, a TBB table with entries jumping to basic blocks BB0 and BB1 + // would look like: + // LJTI_0_0: + // .byte (LBB0 - LJTI_0_0) / 2 + // .byte (LBB1 - LJTI_0_0) / 2 + const MCExpr *Expr = + MCBinaryExpr::CreateSub(MBBSymbolExpr, + MCSymbolRefExpr::Create(JTISymbol, OutContext), + OutContext); + Expr = MCBinaryExpr::CreateDiv(Expr, MCConstantExpr::Create(2, OutContext), + OutContext); + OutStreamer.EmitValue(Expr, OffsetWidth); + } +} + +void ARMAsmPrinter::PrintDebugValueComment(const MachineInstr *MI, + raw_ostream &OS) { + unsigned NOps = MI->getNumOperands(); + assert(NOps==4); + OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); + OS << V.getName(); + OS << " <- "; + // Frame address. Currently handles register +- offset only. + assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); + OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS); + OS << ']'; + OS << "+"; + printOperand(MI, NOps-2, OS); +} + +static void populateADROperands(MCInst &Inst, unsigned Dest, + const MCSymbol *Label, + unsigned pred, unsigned ccreg, + MCContext &Ctx) { + const MCExpr *SymbolExpr = MCSymbolRefExpr::Create(Label, Ctx); + Inst.addOperand(MCOperand::CreateReg(Dest)); + Inst.addOperand(MCOperand::CreateExpr(SymbolExpr)); + // Add predicate operands. + Inst.addOperand(MCOperand::CreateImm(pred)); + Inst.addOperand(MCOperand::CreateReg(ccreg)); +} + +void ARMAsmPrinter::EmitPatchedInstruction(const MachineInstr *MI, + unsigned Opcode) { + MCInst TmpInst; + + // Emit the instruction as usual, just patch the opcode. + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + TmpInst.setOpcode(Opcode); + OutStreamer.EmitInstruction(TmpInst); +} + +void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + switch (Opc) { default: break; - case ARM::PICADD: { // FIXME: Remove asm string from td file. + case ARM::t2ADDrSPi: + case ARM::t2ADDrSPi12: + case ARM::t2SUBrSPi: + case ARM::t2SUBrSPi12: + assert ((MI->getOperand(1).getReg() == ARM::SP) && + "Unexpected source register!"); + break; + + case ARM::t2MOVi32imm: assert(0 && "Should be lowered by thumb2it pass"); + case ARM::DBG_VALUE: { + if (isVerbose() && OutStreamer.hasRawTextSupport()) { + SmallString<128> TmpStr; + raw_svector_ostream OS(TmpStr); + PrintDebugValueComment(MI, OS); + OutStreamer.EmitRawText(StringRef(OS.str())); + } + return; + } + case ARM::tBfar: { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tBL); + TmpInst.addOperand(MCOperand::CreateExpr(MCSymbolRefExpr::Create( + MI->getOperand(0).getMBB()->getSymbol(), OutContext))); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::LEApcrel: + case ARM::tLEApcrel: + case ARM::t2LEApcrel: { + // FIXME: Need to also handle globals and externals + MCInst TmpInst; + TmpInst.setOpcode(MI->getOpcode() == ARM::t2LEApcrel ? ARM::t2ADR + : (MI->getOpcode() == ARM::tLEApcrel ? ARM::tADR + : ARM::ADR)); + populateADROperands(TmpInst, MI->getOperand(0).getReg(), + GetCPISymbol(MI->getOperand(1).getIndex()), + MI->getOperand(2).getImm(), MI->getOperand(3).getReg(), + OutContext); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::LEApcrelJT: + case ARM::tLEApcrelJT: + case ARM::t2LEApcrelJT: { + MCInst TmpInst; + TmpInst.setOpcode(MI->getOpcode() == ARM::t2LEApcrelJT ? ARM::t2ADR + : (MI->getOpcode() == ARM::tLEApcrelJT ? ARM::tADR + : ARM::ADR)); + populateADROperands(TmpInst, MI->getOperand(0).getReg(), + GetARMJTIPICJumpTableLabel2(MI->getOperand(1).getIndex(), + MI->getOperand(2).getImm()), + MI->getOperand(3).getImm(), MI->getOperand(4).getReg(), + OutContext); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::MOVPCRX: { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::BXr9_CALL: + case ARM::BX_CALL: { + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::LR)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::BX); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + OutStreamer.EmitInstruction(TmpInst); + } + return; + } + case ARM::BMOVPCRXr9_CALL: + case ARM::BMOVPCRX_CALL: { + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::LR)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + return; + } + case ARM::MOVi16_ga_pcrel: + case ARM::t2MOVi16_ga_pcrel: { + MCInst TmpInst; + TmpInst.setOpcode(Opc == ARM::MOVi16_ga_pcrel? ARM::MOVi16 : ARM::t2MOVi16); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + + unsigned TF = MI->getOperand(1).getTargetFlags(); + bool isPIC = TF == ARMII::MO_LO16_NONLAZY_PIC; + const GlobalValue *GV = MI->getOperand(1).getGlobal(); + MCSymbol *GVSym = GetARMGVSymbol(GV); + const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext); + if (isPIC) { + MCSymbol *LabelSym = getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), + MI->getOperand(2).getImm(), OutContext); + const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext); + unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4; + const MCExpr *PCRelExpr = + ARMMCExpr::CreateLower16(MCBinaryExpr::CreateSub(GVSymExpr, + MCBinaryExpr::CreateAdd(LabelSymExpr, + MCConstantExpr::Create(PCAdj, OutContext), + OutContext), OutContext), OutContext); + TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr)); + } else { + const MCExpr *RefExpr= ARMMCExpr::CreateLower16(GVSymExpr, OutContext); + TmpInst.addOperand(MCOperand::CreateExpr(RefExpr)); + } + + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::MOVTi16_ga_pcrel: + case ARM::t2MOVTi16_ga_pcrel: { + MCInst TmpInst; + TmpInst.setOpcode(Opc == ARM::MOVTi16_ga_pcrel + ? ARM::MOVTi16 : ARM::t2MOVTi16); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + + unsigned TF = MI->getOperand(2).getTargetFlags(); + bool isPIC = TF == ARMII::MO_HI16_NONLAZY_PIC; + const GlobalValue *GV = MI->getOperand(2).getGlobal(); + MCSymbol *GVSym = GetARMGVSymbol(GV); + const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext); + if (isPIC) { + MCSymbol *LabelSym = getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), + MI->getOperand(3).getImm(), OutContext); + const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext); + unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4; + const MCExpr *PCRelExpr = + ARMMCExpr::CreateUpper16(MCBinaryExpr::CreateSub(GVSymExpr, + MCBinaryExpr::CreateAdd(LabelSymExpr, + MCConstantExpr::Create(PCAdj, OutContext), + OutContext), OutContext), OutContext); + TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr)); + } else { + const MCExpr *RefExpr= ARMMCExpr::CreateUpper16(GVSymExpr, OutContext); + TmpInst.addOperand(MCOperand::CreateExpr(RefExpr)); + } + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::tPICADD: { // This is a pseudo op for a label + instruction sequence, which looks like: // LPC0: - // add r0, pc, r0 + // add r0, pc // This adds the address of LPC0 to r0. // Emit the label. - // FIXME: MOVE TO SHARED PLACE. - unsigned Id = (unsigned)MI->getOperand(2).getImm(); - const char *Prefix = MAI->getPrivateGlobalPrefix(); - MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix) - + "PC" + Twine(getFunctionNumber()) + "_" + Twine(Id)); - OutStreamer.EmitLabel(Label); + OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), MI->getOperand(2).getImm(), + OutContext)); + // Form and emit the add. + MCInst AddInst; + AddInst.setOpcode(ARM::tADDhirr); + AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + AddInst.addOperand(MCOperand::CreateReg(ARM::PC)); + // Add predicate operands. + AddInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + AddInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(AddInst); + return; + } + case ARM::PICADD: { + // This is a pseudo op for a label + instruction sequence, which looks like: + // LPC0: + // add r0, pc, r0 + // This adds the address of LPC0 to r0. + + // Emit the label. + OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), MI->getOperand(2).getImm(), + OutContext)); - // Form and emit tha dd. + // Form and emit the add. MCInst AddInst; AddInst.setOpcode(ARM::ADDrr); AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); AddInst.addOperand(MCOperand::CreateReg(ARM::PC)); AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + // Add predicate operands. + AddInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm())); + AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(4).getReg())); + // Add 's' bit operand (always reg0 for this) + AddInst.addOperand(MCOperand::CreateReg(0)); OutStreamer.EmitInstruction(AddInst); return; } - case ARM::CONSTPOOL_ENTRY: { // FIXME: Remove asm string from td file. + case ARM::PICSTR: + case ARM::PICSTRB: + case ARM::PICSTRH: + case ARM::PICLDR: + case ARM::PICLDRB: + case ARM::PICLDRH: + case ARM::PICLDRSB: + case ARM::PICLDRSH: { + // This is a pseudo op for a label + instruction sequence, which looks like: + // LPC0: + // OP r0, [pc, r0] + // The LCP0 label is referenced by a constant pool entry in order to get + // a PC-relative address at the ldr instruction. + + // Emit the label. + OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), MI->getOperand(2).getImm(), + OutContext)); + + // Form and emit the load + unsigned Opcode; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + case ARM::PICSTR: Opcode = ARM::STRrs; break; + case ARM::PICSTRB: Opcode = ARM::STRBrs; break; + case ARM::PICSTRH: Opcode = ARM::STRH; break; + case ARM::PICLDR: Opcode = ARM::LDRrs; break; + case ARM::PICLDRB: Opcode = ARM::LDRBrs; break; + case ARM::PICLDRH: Opcode = ARM::LDRH; break; + case ARM::PICLDRSB: Opcode = ARM::LDRSB; break; + case ARM::PICLDRSH: Opcode = ARM::LDRSH; break; + } + MCInst LdStInst; + LdStInst.setOpcode(Opcode); + LdStInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + LdStInst.addOperand(MCOperand::CreateReg(ARM::PC)); + LdStInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + LdStInst.addOperand(MCOperand::CreateImm(0)); + // Add predicate operands. + LdStInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm())); + LdStInst.addOperand(MCOperand::CreateReg(MI->getOperand(4).getReg())); + OutStreamer.EmitInstruction(LdStInst); + + return; + } + case ARM::CONSTPOOL_ENTRY: { /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool /// in the function. The first operand is the ID# for this instruction, the /// second is the index into the MachineConstantPool that this is, the third @@ -1371,100 +1086,450 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { return; } - case ARM::MOVi2pieces: { // FIXME: Remove asmstring from td file. - // This is a hack that lowers as a two instruction sequence. - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned ImmVal = (unsigned)MI->getOperand(1).getImm(); + case ARM::t2BR_JT: { + // Lower and emit the instruction itself, then the jump table following it. + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVgpr2gpr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + // Output the data for the jump table itself + EmitJump2Table(MI); + return; + } + case ARM::t2TBB_JT: { + // Lower and emit the instruction itself, then the jump table following it. + MCInst TmpInst; + + TmpInst.setOpcode(ARM::t2TBB); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + // Output the data for the jump table itself + EmitJump2Table(MI); + // Make sure the next instruction is 2-byte aligned. + EmitAlignment(1); + return; + } + case ARM::t2TBH_JT: { + // Lower and emit the instruction itself, then the jump table following it. + MCInst TmpInst; + + TmpInst.setOpcode(ARM::t2TBH); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + // Output the data for the jump table itself + EmitJump2Table(MI); + return; + } + case ARM::tBR_JTr: + case ARM::BR_JTr: { + // Lower and emit the instruction itself, then the jump table following it. + // mov pc, target + MCInst TmpInst; + unsigned Opc = MI->getOpcode() == ARM::BR_JTr ? + ARM::MOVr : ARM::tMOVgpr2gpr; + TmpInst.setOpcode(Opc); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + if (Opc == ARM::MOVr) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + + // Make sure the Thumb jump table is 4-byte aligned. + if (Opc == ARM::tMOVgpr2gpr) + EmitAlignment(2); - unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); - unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); + // Output the data for the jump table itself + EmitJumpTable(MI); + return; + } + case ARM::BR_JTm: { + // Lower and emit the instruction itself, then the jump table following it. + // ldr pc, target + MCInst TmpInst; + if (MI->getOperand(1).getReg() == 0) { + // literal offset + TmpInst.setOpcode(ARM::LDRi12); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); + } else { + TmpInst.setOpcode(ARM::LDRrs); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + TmpInst.addOperand(MCOperand::CreateImm(0)); + } + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + // Output the data for the jump table itself + EmitJumpTable(MI); + return; + } + case ARM::BR_JTadd: { + // Lower and emit the instruction itself, then the jump table following it. + // add pc, target, idx + MCInst TmpInst; + TmpInst.setOpcode(ARM::ADDrr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + + // Output the data for the jump table itself + EmitJumpTable(MI); + return; + } + case ARM::TRAP: { + // Non-Darwin binutils don't yet support the "trap" mnemonic. + // FIXME: Remove this special case when they do. + if (!Subtarget->isTargetDarwin()) { + //.long 0xe7ffdefe @ trap + uint32_t Val = 0xe7ffdefeUL; + OutStreamer.AddComment("trap"); + OutStreamer.EmitIntValue(Val, 4); + return; + } + break; + } + case ARM::tTRAP: { + // Non-Darwin binutils don't yet support the "trap" mnemonic. + // FIXME: Remove this special case when they do. + if (!Subtarget->isTargetDarwin()) { + //.short 57086 @ trap + uint16_t Val = 0xdefe; + OutStreamer.AddComment("trap"); + OutStreamer.EmitIntValue(Val, 2); + return; + } + break; + } + case ARM::t2Int_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp_nofp: + case ARM::tInt_eh_sjlj_setjmp: { + // Two incoming args: GPR:$src, GPR:$val + // mov $val, pc + // adds $val, #7 + // str $val, [$src, #4] + // movs r0, #0 + // b 1f + // movs r0, #1 + // 1: + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ValReg = MI->getOperand(1).getReg(); + MCSymbol *Label = GetARMSJLJEHLabel(); { MCInst TmpInst; - TmpInst.setOpcode(ARM::MOVi); - TmpInst.addOperand(MCOperand::CreateReg(DstReg)); - TmpInst.addOperand(MCOperand::CreateImm(SOImmValV1)); - + TmpInst.setOpcode(ARM::tMOVgpr2tgpr); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + // 's' bit operand + TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR)); + OutStreamer.AddComment("eh_setjmp begin"); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tADDi3); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + // 's' bit operand + TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR)); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + TmpInst.addOperand(MCOperand::CreateImm(7)); // Predicate. - TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); - - TmpInst.addOperand(MCOperand::CreateReg(0)); // cc_out + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); OutStreamer.EmitInstruction(TmpInst); } - { MCInst TmpInst; - TmpInst.setOpcode(ARM::ORRri); - TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // dstreg - TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // inreg - TmpInst.addOperand(MCOperand::CreateImm(SOImmValV2)); // so_imm + TmpInst.setOpcode(ARM::tSTRi); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + // The offset immediate is #4. The operand value is scaled by 4 for the + // tSTR instruction. + TmpInst.addOperand(MCOperand::CreateImm(1)); // Predicate. - TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVi8); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R0)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + const MCExpr *SymbolExpr = MCSymbolRefExpr::Create(Label, OutContext); + MCInst TmpInst; + TmpInst.setOpcode(ARM::tB); + TmpInst.addOperand(MCOperand::CreateExpr(SymbolExpr)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVi8); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R0)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR)); + TmpInst.addOperand(MCOperand::CreateImm(1)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("eh_setjmp end"); + OutStreamer.EmitInstruction(TmpInst); + } + OutStreamer.EmitLabel(Label); + return; + } - TmpInst.addOperand(MCOperand::CreateReg(0)); // cc_out + case ARM::Int_eh_sjlj_setjmp_nofp: + case ARM::Int_eh_sjlj_setjmp: { + // Two incoming args: GPR:$src, GPR:$val + // add $val, pc, #8 + // str $val, [$src, #+4] + // mov r0, #0 + // add pc, pc, #0 + // mov r0, #1 + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ValReg = MI->getOperand(1).getReg(); + + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::ADDri); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateImm(8)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // 's' bit operand (always reg0 for this). + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("eh_setjmp begin"); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::STRi12); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateImm(4)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVi); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R0)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // 's' bit operand (always reg0 for this). + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::ADDri); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // 's' bit operand (always reg0 for this). + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVi); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R0)); + TmpInst.addOperand(MCOperand::CreateImm(1)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // 's' bit operand (always reg0 for this). + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("eh_setjmp end"); OutStreamer.EmitInstruction(TmpInst); } return; } - case ARM::MOVi32imm: { // FIXME: Remove asmstring from td file. - // This is a hack that lowers as a two instruction sequence. - unsigned DstReg = MI->getOperand(0).getReg(); - const MachineOperand &MO = MI->getOperand(1); - MCOperand V1, V2; - if (MO.isImm()) { - unsigned ImmVal = (unsigned)MI->getOperand(1).getImm(); - V1 = MCOperand::CreateImm(ImmVal & 65535); - V2 = MCOperand::CreateImm(ImmVal >> 16); - } else if (MO.isGlobal()) { - MCSymbol *Symbol = MCInstLowering.GetGlobalAddressSymbol(MO); - const MCSymbolRefExpr *SymRef1 = - MCSymbolRefExpr::Create(Symbol, - MCSymbolRefExpr::VK_ARM_LO16, OutContext); - const MCSymbolRefExpr *SymRef2 = - MCSymbolRefExpr::Create(Symbol, - MCSymbolRefExpr::VK_ARM_HI16, OutContext); - V1 = MCOperand::CreateExpr(SymRef1); - V2 = MCOperand::CreateExpr(SymRef2); - } else { - MI->dump(); - llvm_unreachable("cannot handle this operand"); + case ARM::Int_eh_sjlj_longjmp: { + // ldr sp, [$src, #8] + // ldr $scratch, [$src, #4] + // ldr r7, [$src] + // bx $scratch + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ScratchReg = MI->getOperand(1).getReg(); + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::LDRi12); + TmpInst.addOperand(MCOperand::CreateReg(ARM::SP)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateImm(8)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); } - { MCInst TmpInst; - TmpInst.setOpcode(ARM::MOVi16); - TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // dstreg - TmpInst.addOperand(V1); // lower16(imm) - + TmpInst.setOpcode(ARM::LDRi12); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateImm(4)); // Predicate. - TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); - + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); OutStreamer.EmitInstruction(TmpInst); } - { MCInst TmpInst; - TmpInst.setOpcode(ARM::MOVTi16); - TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // dstreg - TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // srcreg - TmpInst.addOperand(V2); // upper16(imm) - + TmpInst.setOpcode(ARM::LDRi12); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R7)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateImm(0)); // Predicate. - TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); - + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::BX); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); OutStreamer.EmitInstruction(TmpInst); } - return; } + case ARM::tInt_eh_sjlj_longjmp: { + // ldr $scratch, [$src, #8] + // mov sp, $scratch + // ldr $scratch, [$src, #4] + // ldr r7, [$src] + // bx $scratch + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ScratchReg = MI->getOperand(1).getReg(); + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tLDRi); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + // The offset immediate is #8. The operand value is scaled by 4 for the + // tLDR instruction. + TmpInst.addOperand(MCOperand::CreateImm(2)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVtgpr2gpr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::SP)); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tLDRi); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateImm(1)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tLDRr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R7)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tBX_RET_vararg); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + return; + } + // These are the pseudos created to comply with stricter operand restrictions + // on ARMv5. Lower them now to "normal" instructions, since all the + // restrictions are already satisfied. + case ARM::MULv5: + EmitPatchedInstruction(MI, ARM::MUL); + return; + case ARM::MLAv5: + EmitPatchedInstruction(MI, ARM::MLA); + return; + case ARM::SMULLv5: + EmitPatchedInstruction(MI, ARM::SMULL); + return; + case ARM::UMULLv5: + EmitPatchedInstruction(MI, ARM::UMULL); + return; + case ARM::SMLALv5: + EmitPatchedInstruction(MI, ARM::SMLAL); + return; + case ARM::UMLALv5: + EmitPatchedInstruction(MI, ARM::UMLAL); + return; + case ARM::UMAALv5: + EmitPatchedInstruction(MI, ARM::UMAAL); + return; } MCInst TmpInst; - MCInstLowering.Lower(MI, TmpInst); + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); OutStreamer.EmitInstruction(TmpInst); } @@ -1476,7 +1541,7 @@ static MCInstPrinter *createARMMCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI) { if (SyntaxVariant == 0) - return new ARMInstPrinter(MAI, false); + return new ARMInstPrinter(MAI); return 0; } diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h new file mode 100644 index 0000000..5852684 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h @@ -0,0 +1,112 @@ +//===-- ARMAsmPrinter.h - Print machine code to an ARM .s file ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// ARM Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMASMPRINTER_H +#define ARMASMPRINTER_H + +#include "ARM.h" +#include "ARMTargetMachine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { + +namespace ARM { + enum DW_ISA { + DW_ISA_ARM_thumb = 1, + DW_ISA_ARM_arm = 2 + }; +} + +class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when printing asm code for different targets. + const ARMSubtarget *Subtarget; + + /// AFI - Keep a pointer to ARMFunctionInfo for the current + /// MachineFunction. + ARMFunctionInfo *AFI; + + /// MCP - Keep a pointer to constantpool entries of the current + /// MachineFunction. + const MachineConstantPool *MCP; + +public: + explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL) { + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + } + + virtual const char *getPassName() const { + return "ARM Assembly Printer"; + } + + void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, + const char *Modifier = 0); + + virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O); + virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O); + + void EmitJumpTable(const MachineInstr *MI); + void EmitJump2Table(const MachineInstr *MI); + virtual void EmitInstruction(const MachineInstr *MI); + bool runOnMachineFunction(MachineFunction &F); + + virtual void EmitConstantPool() {} // we emit constant pools customly! + virtual void EmitFunctionEntryLabel(); + void EmitStartOfAsmFile(Module &M); + void EmitEndOfAsmFile(Module &M); + +private: + // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile() + void emitAttributes(); + + // Helper for ELF .o only + void emitARMAttributeSection(); + + // Generic helper used to emit e.g. ARMv5 mul pseudos + void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc); + +public: + void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); + + MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + + virtual unsigned getISAEncoding() { + // ARM/Darwin adds ISA to the DWARF info for each function. + if (!Subtarget->isTargetDarwin()) + return 0; + return Subtarget->isThumb() ? + llvm::ARM::DW_ISA_ARM_thumb : llvm::ARM::DW_ISA_ARM_arm; + } + + MCSymbol *GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, + const MachineBasicBlock *MBB) const; + MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const; + + MCSymbol *GetARMSJLJEHLabel(void) const; + + MCSymbol *GetARMGVSymbol(const GlobalValue *GV); + + /// EmitMachineConstantPoolValue - Print a machine constantpool value to + /// the .s file. + virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV); +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInfo.h new file mode 100644 index 0000000..a56cc1a --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInfo.h @@ -0,0 +1,249 @@ +//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the ARM target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMBASEINFO_H +#define ARMBASEINFO_H + +#include "llvm/Support/ErrorHandling.h" + +// Note that the following auto-generated files only defined enum types, and +// so are safe to include here. + +// Defines symbolic names for ARM registers. This defines a mapping from +// register name to register number. +// +#include "ARMGenRegisterNames.inc" + +// Defines symbolic names for the ARM instructions. +// +#include "ARMGenInstrNames.inc" + +namespace llvm { + +// Enums corresponding to ARM condition codes +namespace ARMCC { + // The CondCodes constants map directly to the 4-bit encoding of the + // condition field for predicated instructions. + enum CondCodes { // Meaning (integer) Meaning (floating-point) + EQ, // Equal Equal + NE, // Not equal Not equal, or unordered + HS, // Carry set >, ==, or unordered + LO, // Carry clear Less than + MI, // Minus, negative Less than + PL, // Plus, positive or zero >, ==, or unordered + VS, // Overflow Unordered + VC, // No overflow Not unordered + HI, // Unsigned higher Greater than, or unordered + LS, // Unsigned lower or same Less than or equal + GE, // Greater than or equal Greater than or equal + LT, // Less than Less than, or unordered + GT, // Greater than Greater than + LE, // Less than or equal <, ==, or unordered + AL // Always (unconditional) Always (unconditional) + }; + + inline static CondCodes getOppositeCondition(CondCodes CC) { + switch (CC) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return NE; + case NE: return EQ; + case HS: return LO; + case LO: return HS; + case MI: return PL; + case PL: return MI; + case VS: return VC; + case VC: return VS; + case HI: return LS; + case LS: return HI; + case GE: return LT; + case LT: return GE; + case GT: return LE; + case LE: return GT; + } + } +} // namespace ARMCC + +inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { + switch (CC) { + default: llvm_unreachable("Unknown condition code"); + case ARMCC::EQ: return "eq"; + case ARMCC::NE: return "ne"; + case ARMCC::HS: return "hs"; + case ARMCC::LO: return "lo"; + case ARMCC::MI: return "mi"; + case ARMCC::PL: return "pl"; + case ARMCC::VS: return "vs"; + case ARMCC::VC: return "vc"; + case ARMCC::HI: return "hi"; + case ARMCC::LS: return "ls"; + case ARMCC::GE: return "ge"; + case ARMCC::LT: return "lt"; + case ARMCC::GT: return "gt"; + case ARMCC::LE: return "le"; + case ARMCC::AL: return "al"; + } +} + +namespace ARM_PROC { + enum IMod { + IE = 2, + ID = 3 + }; + + enum IFlags { + F = 1, + I = 2, + A = 4 + }; + + inline static const char *IFlagsToString(unsigned val) { + switch (val) { + default: llvm_unreachable("Unknown iflags operand"); + case F: return "f"; + case I: return "i"; + case A: return "a"; + } + } + + inline static const char *IModToString(unsigned val) { + switch (val) { + default: llvm_unreachable("Unknown imod operand"); + case IE: return "ie"; + case ID: return "id"; + } + } +} + +namespace ARM_MB { + // The Memory Barrier Option constants map directly to the 4-bit encoding of + // the option field for memory barrier operations. + enum MemBOpt { + SY = 15, + ST = 14, + ISH = 11, + ISHST = 10, + NSH = 7, + NSHST = 6, + OSH = 3, + OSHST = 2 + }; + + inline static const char *MemBOptToString(unsigned val) { + switch (val) { + default: llvm_unreachable("Unknown memory operation"); + case SY: return "sy"; + case ST: return "st"; + case ISH: return "ish"; + case ISHST: return "ishst"; + case NSH: return "nsh"; + case NSHST: return "nshst"; + case OSH: return "osh"; + case OSHST: return "oshst"; + } + } +} // namespace ARM_MB + +/// getARMRegisterNumbering - Given the enum value for some register, e.g. +/// ARM::LR, return the number that it corresponds to (e.g. 14). +inline static unsigned getARMRegisterNumbering(unsigned Reg) { + using namespace ARM; + switch (Reg) { + default: + llvm_unreachable("Unknown ARM register!"); + case R0: case S0: case D0: case Q0: return 0; + case R1: case S1: case D1: case Q1: return 1; + case R2: case S2: case D2: case Q2: return 2; + case R3: case S3: case D3: case Q3: return 3; + case R4: case S4: case D4: case Q4: return 4; + case R5: case S5: case D5: case Q5: return 5; + case R6: case S6: case D6: case Q6: return 6; + case R7: case S7: case D7: case Q7: return 7; + case R8: case S8: case D8: case Q8: return 8; + case R9: case S9: case D9: case Q9: return 9; + case R10: case S10: case D10: case Q10: return 10; + case R11: case S11: case D11: case Q11: return 11; + case R12: case S12: case D12: case Q12: return 12; + case SP: case S13: case D13: case Q13: return 13; + case LR: case S14: case D14: case Q14: return 14; + case PC: case S15: case D15: case Q15: return 15; + + case S16: case D16: return 16; + case S17: case D17: return 17; + case S18: case D18: return 18; + case S19: case D19: return 19; + case S20: case D20: return 20; + case S21: case D21: return 21; + case S22: case D22: return 22; + case S23: case D23: return 23; + case S24: case D24: return 24; + case S25: case D25: return 25; + case S26: case D26: return 26; + case S27: case D27: return 27; + case S28: case D28: return 28; + case S29: case D29: return 29; + case S30: case D30: return 30; + case S31: case D31: return 31; + } +} + +namespace ARMII { + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // ARM Specific MachineOperand flags. + + MO_NO_FLAG, + + /// MO_LO16 - On a symbol operand, this represents a relocation containing + /// lower 16 bit of the address. Used only via movw instruction. + MO_LO16, + + /// MO_HI16 - On a symbol operand, this represents a relocation containing + /// higher 16 bit of the address. Used only via movt instruction. + MO_HI16, + + /// MO_LO16_NONLAZY - On a symbol operand "FOO", this represents a + /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol, + /// i.e. "FOO$non_lazy_ptr". + /// Used only via movw instruction. + MO_LO16_NONLAZY, + + /// MO_HI16_NONLAZY - On a symbol operand "FOO", this represents a + /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol, + /// i.e. "FOO$non_lazy_ptr". Used only via movt instruction. + MO_HI16_NONLAZY, + + /// MO_LO16_NONLAZY_PIC - On a symbol operand "FOO", this represents a + /// relocation containing lower 16 bit of the PC relative address of the + /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL". + /// Used only via movw instruction. + MO_LO16_NONLAZY_PIC, + + /// MO_HI16_NONLAZY_PIC - On a symbol operand "FOO", this represents a + /// relocation containing lower 16 bit of the PC relative address of the + /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL". + /// Used only via movt instruction. + MO_HI16_NONLAZY_PIC, + + /// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a + /// call operand. + MO_PLT + }; +} // end namespace ARMII + +} // end namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index e4f10f9..2268e59 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -15,13 +15,13 @@ #include "ARM.h" #include "ARMAddressingModes.h" #include "ARMConstantPoolValue.h" +#include "ARMHazardRecognizer.h" #include "ARMMachineFunctionInfo.h" #include "ARMRegisterInfo.h" #include "ARMGenInstrInfo.inc" #include "llvm/Constants.h" #include "llvm/Function.h" #include "llvm/GlobalValue.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -34,15 +34,75 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/ADT/STLExtras.h" using namespace llvm; static cl::opt<bool> EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden, cl::desc("Enable ARM 2-addr to 3-addr conv")); +/// ARM_MLxEntry - Record information about MLA / MLS instructions. +struct ARM_MLxEntry { + unsigned MLxOpc; // MLA / MLS opcode + unsigned MulOpc; // Expanded multiplication opcode + unsigned AddSubOpc; // Expanded add / sub opcode + bool NegAcc; // True if the acc is negated before the add / sub. + bool HasLane; // True if instruction has an extra "lane" operand. +}; + +static const ARM_MLxEntry ARM_MLxTable[] = { + // MLxOpc, MulOpc, AddSubOpc, NegAcc, HasLane + // fp scalar ops + { ARM::VMLAS, ARM::VMULS, ARM::VADDS, false, false }, + { ARM::VMLSS, ARM::VMULS, ARM::VSUBS, false, false }, + { ARM::VMLAD, ARM::VMULD, ARM::VADDD, false, false }, + { ARM::VMLSD, ARM::VMULD, ARM::VSUBD, false, false }, + { ARM::VNMLAS, ARM::VNMULS, ARM::VSUBS, true, false }, + { ARM::VNMLSS, ARM::VMULS, ARM::VSUBS, true, false }, + { ARM::VNMLAD, ARM::VNMULD, ARM::VSUBD, true, false }, + { ARM::VNMLSD, ARM::VMULD, ARM::VSUBD, true, false }, + + // fp SIMD ops + { ARM::VMLAfd, ARM::VMULfd, ARM::VADDfd, false, false }, + { ARM::VMLSfd, ARM::VMULfd, ARM::VSUBfd, false, false }, + { ARM::VMLAfq, ARM::VMULfq, ARM::VADDfq, false, false }, + { ARM::VMLSfq, ARM::VMULfq, ARM::VSUBfq, false, false }, + { ARM::VMLAslfd, ARM::VMULslfd, ARM::VADDfd, false, true }, + { ARM::VMLSslfd, ARM::VMULslfd, ARM::VSUBfd, false, true }, + { ARM::VMLAslfq, ARM::VMULslfq, ARM::VADDfq, false, true }, + { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true }, +}; + ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)), Subtarget(STI) { + for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) { + if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) + assert(false && "Duplicated entries?"); + MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc); + MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc); + } +} + +// Use a ScoreboardHazardRecognizer for prepass ARM scheduling. TargetInstrImpl +// currently defaults to no prepass hazard recognizer. +ScheduleHazardRecognizer *ARMBaseInstrInfo:: +CreateTargetHazardRecognizer(const TargetMachine *TM, + const ScheduleDAG *DAG) const { + if (usePreRAHazardRecognizer()) { + const InstrItineraryData *II = TM->getInstrItineraryData(); + return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched"); + } + return TargetInstrInfoImpl::CreateTargetHazardRecognizer(TM, DAG); +} + +ScheduleHazardRecognizer *ARMBaseInstrInfo:: +CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const { + if (Subtarget.isThumb2() || Subtarget.hasVFP2()) + return (ScheduleHazardRecognizer *) + new ARMHazardRecognizer(II, *this, getRegisterInfo(), Subtarget, DAG); + return TargetInstrInfoImpl::CreateTargetPostRAHazardRecognizer(II, DAG); } MachineInstr * @@ -140,7 +200,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (isLoad) MemMI = BuildMI(MF, MI->getDebugLoc(), get(MemOpc), MI->getOperand(0).getReg()) - .addReg(WBReg).addReg(0).addImm(0).addImm(Pred); + .addReg(WBReg).addImm(0).addImm(Pred); else MemMI = BuildMI(MF, MI->getDebugLoc(), get(MemOpc)).addReg(MI->getOperand(1).getReg()) @@ -151,7 +211,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (isLoad) MemMI = BuildMI(MF, MI->getDebugLoc(), get(MemOpc), MI->getOperand(0).getReg()) - .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred); + .addReg(BaseReg).addImm(0).addImm(Pred); else MemMI = BuildMI(MF, MI->getDebugLoc(), get(MemOpc)).addReg(MI->getOperand(1).getReg()) @@ -166,8 +226,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (LV) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.getReg() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { unsigned Reg = MO.getReg(); LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); @@ -197,43 +256,6 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMIs[0]; } -bool -ARMBaseInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - if (CSI.empty()) - return false; - - DebugLoc DL; - if (MI != MBB.end()) DL = MI->getDebugLoc(); - - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - bool isKill = true; - - // Add the callee-saved register as live-in unless it's LR and - // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress - // then it's already added to the function and entry block live-in sets. - if (Reg == ARM::LR) { - MachineFunction &MF = *MBB.getParent(); - if (MF.getFrameInfo()->isReturnAddressTaken() && - MF.getRegInfo().isLiveIn(Reg)) - isKill = false; - } - - if (isKill) - MBB.addLiveIn(Reg); - - // Insert the spill to the stack frame. The register is killed at the spill - // - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - storeRegToStackSlot(MBB, MI, Reg, isKill, - CSI[i].getFrameIdx(), RC, TRI); - } - return true; -} - // Branch analysis. bool ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, @@ -275,13 +297,31 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, // Get the instruction before it if it is a terminator. MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If AllowModify is true and the block ends with two or more unconditional + // branches, delete all but the first unconditional branch. + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { + LastInst->eraseFromParent(); + LastInst = SecondLastInst; + LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + // Return now the only terminator is an unconditional branch. + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else { + SecondLastInst = I; + SecondLastOpc = SecondLastInst->getOpcode(); + } + } + } // If there are three terminators, we don't know what sort of block this is. if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) return true; // If the block ends with a B and a Bcc, handle it. - unsigned SecondLastOpc = SecondLastInst->getOpcode(); if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { TBB = SecondLastInst->getOperand(0).getMBB(); Cond.push_back(SecondLastInst->getOperand(1)); @@ -468,7 +508,7 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const { } /// FIXME: Works around a gcc miscompilation with -fstrict-aliasing. -DISABLE_INLINE +LLVM_ATTRIBUTE_NOINLINE static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, unsigned JTI); static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, @@ -513,6 +553,14 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { case ARMII::Size2Bytes: return 2; // Thumb1 instruction. case ARMII::SizeSpecial: { switch (Opc) { + case ARM::MOVi16_ga_pcrel: + case ARM::MOVTi16_ga_pcrel: + case ARM::t2MOVi16_ga_pcrel: + case ARM::t2MOVTi16_ga_pcrel: + return 4; + case ARM::MOVi32imm: + case ARM::t2MOVi32imm: + return 8; case ARM::CONSTPOOL_ENTRY: // If this machine instr is a constant pool entry, its size is recorded as // operand #2. @@ -533,13 +581,13 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { case ARM::BR_JTadd: case ARM::tBR_JTr: case ARM::t2BR_JT: - case ARM::t2TBB: - case ARM::t2TBH: { + case ARM::t2TBB_JT: + case ARM::t2TBH_JT: { // These are jumptable branches, i.e. a branch followed by an inlined // jumptable. The size is 4 + 4 * number of entries. For TBB, each // entry is one byte; TBH two byte each. - unsigned EntrySize = (Opc == ARM::t2TBB) - ? 1 : ((Opc == ARM::t2TBH) ? 2 : 4); + unsigned EntrySize = (Opc == ARM::t2TBB_JT) + ? 1 : ((Opc == ARM::t2TBH_JT) ? 2 : 4); unsigned NumOps = TID.getNumOperands(); MachineOperand JTOP = MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2)); @@ -557,7 +605,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { // alignment issue. unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4; unsigned NumEntries = getNumJTEntries(JT, JTI); - if (Opc == ARM::t2TBB && (NumEntries & 1)) + if (Opc == ARM::t2TBB_JT && (NumEntries & 1)) // Make sure the instruction that follows TBB is 2-byte aligned. // FIXME: Constant island pass should insert an "ALIGN" instruction // instead. @@ -573,84 +621,6 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { return 0; // Not reached } -unsigned -ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - case ARM::LDR: - case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame. - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isReg() && - MI->getOperand(3).isImm() && - MI->getOperand(2).getReg() == 0 && - MI->getOperand(3).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - case ARM::t2LDRi12: - case ARM::tRestore: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - case ARM::VLDRD: - case ARM::VLDRS: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - - return 0; -} - -unsigned -ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - case ARM::STR: - case ARM::t2STRs: // FIXME: don't use t2STRs to access frame. - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isReg() && - MI->getOperand(3).isImm() && - MI->getOperand(2).getReg() == 0 && - MI->getOperand(3).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - case ARM::t2STRi12: - case ARM::tSpill: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - case ARM::VSTRD: - case ARM::VSTRS: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - - return 0; -} - void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, @@ -715,8 +685,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Align = MFI.getObjectAlignment(FI); MachineMemOperand *MMO = - MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), - MachineMemOperand::MOStore, 0, + MF.getMachineMemOperand(MachinePointerInfo( + PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); @@ -728,9 +699,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, switch (RC->getID()) { case ARM::GPRRegClassID: - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR)) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12)) .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); break; case ARM::SPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS)) @@ -747,17 +718,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: case ARM::QPR_8RegClassID: - // FIXME: Neon instructions should support predicates - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q)) + if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64Pseudo)) .addFrameIndex(FI).addImm(16) .addReg(SrcReg, getKillRegState(isKill)) .addMemOperand(MMO)); } else { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQ)) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) - .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)) .addMemOperand(MMO)); } break; @@ -766,18 +735,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { // FIXME: It's possible to only store part of the QQ register if the // spilled def has a sub-register index. - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VST1d64Q)) - .addFrameIndex(FI).addImm(16); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); - AddDefaultPred(MIB.addMemOperand(MMO)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo)) + .addFrameIndex(FI).addImm(16) + .addReg(SrcReg, getKillRegState(isKill)) + .addMemOperand(MMO)); } else { MachineInstrBuilder MIB = - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD)) - .addFrameIndex(FI) - .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA)) + .addFrameIndex(FI)) .addMemOperand(MMO); MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); @@ -787,9 +752,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case ARM::QQQQPRRegClassID: { MachineInstrBuilder MIB = - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD)) - .addFrameIndex(FI) - .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA)) + .addFrameIndex(FI)) .addMemOperand(MMO); MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); @@ -806,6 +770,53 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } } +unsigned +ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case ARM::STRrs: + case ARM::t2STRs: // FIXME: don't use t2STRs to access frame. + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isReg() && + MI->getOperand(3).isImm() && + MI->getOperand(2).getReg() == 0 && + MI->getOperand(3).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::STRi12: + case ARM::t2STRi12: + case ARM::tSpill: + case ARM::VSTRD: + case ARM::VSTRS: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::VST1q64Pseudo: + if (MI->getOperand(0).isFI() && + MI->getOperand(2).getSubReg() == 0) { + FrameIndex = MI->getOperand(0).getIndex(); + return MI->getOperand(2).getReg(); + } + break; + case ARM::VSTMQIA: + if (MI->getOperand(1).isFI() && + MI->getOperand(0).getSubReg() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + void ARMBaseInstrInfo:: loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, @@ -817,8 +828,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); MachineMemOperand *MMO = - MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), - MachineMemOperand::MOLoad, 0, + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); @@ -830,8 +842,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, switch (RC->getID()) { case ARM::GPRRegClassID: - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg) - .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); break; case ARM::SPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg) @@ -846,31 +858,26 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: case ARM::QPR_8RegClassID: - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg) + if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64Pseudo), DestReg) .addFrameIndex(FI).addImm(16) .addMemOperand(MMO)); } else { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg) .addFrameIndex(FI) - .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)) .addMemOperand(MMO)); } break; case ARM::QQPRRegClassID: case ARM::QQPR_VFP2RegClassID: if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLD1d64Q)); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); - AddDefaultPred(MIB.addFrameIndex(FI).addImm(16).addMemOperand(MMO)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg) + .addFrameIndex(FI).addImm(16) + .addMemOperand(MMO)); } else { MachineInstrBuilder MIB = - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) - .addFrameIndex(FI) - .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) + .addFrameIndex(FI)) .addMemOperand(MMO); MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); @@ -880,9 +887,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case ARM::QQQQPRRegClassID: { MachineInstrBuilder MIB = - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) - .addFrameIndex(FI) - .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) + .addFrameIndex(FI)) .addMemOperand(MMO); MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); @@ -899,6 +905,53 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } } +unsigned +ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case ARM::LDRrs: + case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame. + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isReg() && + MI->getOperand(3).isImm() && + MI->getOperand(2).getReg() == 0 && + MI->getOperand(3).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::LDRi12: + case ARM::t2LDRi12: + case ARM::tRestore: + case ARM::VLDRD: + case ARM::VLDRS: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::VLD1q64Pseudo: + if (MI->getOperand(1).isFI() && + MI->getOperand(0).getSubReg() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::VLDMQIA: + if (MI->getOperand(1).isFI() && + MI->getOperand(0).getSubReg() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + MachineInstr* ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, @@ -921,7 +974,7 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal); - unsigned PCLabelId = AFI->createConstPoolEntryUId(); + unsigned PCLabelId = AFI->createPICLabelUId(); ARMConstantPoolValue *NewCPV = 0; // FIXME: The below assumes PIC relocation model and that the function // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and @@ -991,12 +1044,18 @@ ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const { } bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, - const MachineInstr *MI1) const { + const MachineInstr *MI1, + const MachineRegisterInfo *MRI) const { int Opcode = MI0->getOpcode(); if (Opcode == ARM::t2LDRpci || Opcode == ARM::t2LDRpci_pic || Opcode == ARM::tLDRpci || - Opcode == ARM::tLDRpci_pic) { + Opcode == ARM::tLDRpci_pic || + Opcode == ARM::MOV_ga_dyn || + Opcode == ARM::MOV_ga_pcrel || + Opcode == ARM::MOV_ga_pcrel_ldr || + Opcode == ARM::t2MOV_ga_dyn || + Opcode == ARM::t2MOV_ga_pcrel) { if (MI1->getOpcode() != Opcode) return false; if (MI0->getNumOperands() != MI1->getNumOperands()) @@ -1007,6 +1066,14 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, if (MO0.getOffset() != MO1.getOffset()) return false; + if (Opcode == ARM::MOV_ga_dyn || + Opcode == ARM::MOV_ga_pcrel || + Opcode == ARM::MOV_ga_pcrel_ldr || + Opcode == ARM::t2MOV_ga_dyn || + Opcode == ARM::t2MOV_ga_pcrel) + // Ignore the PC labels. + return MO0.getGlobal() == MO1.getGlobal(); + const MachineFunction *MF = MI0->getParent()->getParent(); const MachineConstantPool *MCP = MF->getConstantPool(); int CPI0 = MO0.getIndex(); @@ -1018,6 +1085,37 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, ARMConstantPoolValue *ACPV1 = static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal); return ACPV0->hasSameValue(ACPV1); + } else if (Opcode == ARM::PICLDR) { + if (MI1->getOpcode() != Opcode) + return false; + if (MI0->getNumOperands() != MI1->getNumOperands()) + return false; + + unsigned Addr0 = MI0->getOperand(1).getReg(); + unsigned Addr1 = MI1->getOperand(1).getReg(); + if (Addr0 != Addr1) { + if (!MRI || + !TargetRegisterInfo::isVirtualRegister(Addr0) || + !TargetRegisterInfo::isVirtualRegister(Addr1)) + return false; + + // This assumes SSA form. + MachineInstr *Def0 = MRI->getVRegDef(Addr0); + MachineInstr *Def1 = MRI->getVRegDef(Addr1); + // Check if the loaded value, e.g. a constantpool of a global address, are + // the same. + if (!produceSameValue(Def0, Def1, MRI)) + return false; + } + + for (unsigned i = 3, e = MI0->getNumOperands(); i != e; ++i) { + // %vreg12<def> = PICLDR %vreg11, 0, pred:14, pred:%noreg + const MachineOperand &MO0 = MI0->getOperand(i); + const MachineOperand &MO1 = MI1->getOperand(i); + if (!MO0.isIdenticalTo(MO1)) + return false; + } + return true; } return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs); @@ -1040,8 +1138,8 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, switch (Load1->getMachineOpcode()) { default: return false; - case ARM::LDR: - case ARM::LDRB: + case ARM::LDRi12: + case ARM::LDRBi12: case ARM::LDRD: case ARM::LDRH: case ARM::LDRSB: @@ -1059,8 +1157,8 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, switch (Load2->getMachineOpcode()) { default: return false; - case ARM::LDR: - case ARM::LDRB: + case ARM::LDRi12: + case ARM::LDRBi12: case ARM::LDRD: case ARM::LDRH: case ARM::LDRSB: @@ -1164,22 +1262,37 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, return false; } -bool ARMBaseInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs) const { - if (!NumInstrs) +bool ARMBaseInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + unsigned ExtraPredCycles, + float Probability, + float Confidence) const { + if (!NumCyles) return false; - if (Subtarget.getCPUString() == "generic") - // Generic (and overly aggressive) if-conversion limits for testing. - return NumInstrs <= 10; - else if (Subtarget.hasV7Ops()) - return NumInstrs <= 3; - return NumInstrs <= 2; + + // Attempt to estimate the relative costs of predication versus branching. + float UnpredCost = Probability * NumCyles; + UnpredCost += 1.0; // The branch itself + UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty(); + + return (float)(NumCyles + ExtraPredCycles) < UnpredCost; } - + bool ARMBaseInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, - MachineBasicBlock &FMBB, unsigned NumF) const { - return NumT && NumF && NumT <= 2 && NumF <= 2; +isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned TCycles, unsigned TExtra, + MachineBasicBlock &FMBB, + unsigned FCycles, unsigned FExtra, + float Probability, float Confidence) const { + if (!TCycles || !FCycles) + return false; + + // Attempt to estimate the relative costs of predication versus branching. + float UnpredCost = Probability * TCycles + (1.0 - Probability) * FCycles; + UnpredCost += 1.0; // The branch itself + UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty(); + + return (float)(TCycles + FCycles + TExtra + FExtra) < UnpredCost; } /// getInstrPredicate - If instruction is predicated, returns its predicate @@ -1292,6 +1405,12 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned NumBits = 0; unsigned Scale = 1; switch (AddrMode) { + case ARMII::AddrMode_i12: { + ImmIdx = FrameRegIdx + 1; + InstrOffs = MI.getOperand(ImmIdx).getImm(); + NumBits = 12; + break; + } case ARMII::AddrMode2: { ImmIdx = FrameRegIdx+2; InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm()); @@ -1342,8 +1461,15 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, if ((unsigned)Offset <= Mask * Scale) { // Replace the FrameIndex with sp MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); - if (isSub) - ImmedOffset |= 1 << NumBits; + // FIXME: When addrmode2 goes away, this will simplify (like the + // T2 version), as the LDR.i12 versions don't need the encoding + // tricks for the offset value. + if (isSub) { + if (AddrMode == ARMII::AddrMode_i12) + ImmedOffset = -ImmedOffset; + else + ImmedOffset |= 1 << NumBits; + } ImmOp.ChangeToImmediate(ImmedOffset); Offset = 0; return true; @@ -1351,8 +1477,12 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, // Otherwise, it didn't fit. Pull in what we can to simplify the immed. ImmedOffset = ImmedOffset & Mask; - if (isSub) - ImmedOffset |= 1 << NumBits; + if (isSub) { + if (AddrMode == ARMII::AddrMode_i12) + ImmedOffset = -ImmedOffset; + else + ImmedOffset |= 1 << NumBits; + } ImmOp.ChangeToImmediate(ImmedOffset); Offset &= ~(Mask*Scale); } @@ -1363,25 +1493,88 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, } bool ARMBaseInstrInfo:: -AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, int &CmpValue) const { +AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, int &CmpMask, + int &CmpValue) const { switch (MI->getOpcode()) { default: break; case ARM::CMPri: - case ARM::CMPzri: case ARM::t2CMPri: - case ARM::t2CMPzri: SrcReg = MI->getOperand(0).getReg(); + CmpMask = ~0; CmpValue = MI->getOperand(1).getImm(); return true; + case ARM::TSTri: + case ARM::t2TSTri: + SrcReg = MI->getOperand(0).getReg(); + CmpMask = MI->getOperand(1).getImm(); + CmpValue = 0; + return true; } return false; } -/// ConvertToSetZeroFlag - Convert the instruction to set the "zero" flag so -/// that we can remove a "comparison with zero". +/// isSuitableForMask - Identify a suitable 'and' instruction that +/// operates on the given source register and applies the same mask +/// as a 'tst' instruction. Provide a limited look-through for copies. +/// When successful, MI will hold the found instruction. +static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg, + int CmpMask, bool CommonUse) { + switch (MI->getOpcode()) { + case ARM::ANDri: + case ARM::t2ANDri: + if (CmpMask != MI->getOperand(2).getImm()) + return false; + if (SrcReg == MI->getOperand(CommonUse ? 1 : 0).getReg()) + return true; + break; + case ARM::COPY: { + // Walk down one instruction which is potentially an 'and'. + const MachineInstr &Copy = *MI; + MachineBasicBlock::iterator AND( + llvm::next(MachineBasicBlock::iterator(MI))); + if (AND == MI->getParent()->end()) return false; + MI = AND; + return isSuitableForMask(MI, Copy.getOperand(0).getReg(), + CmpMask, true); + } + } + + return false; +} + +/// OptimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register. bool ARMBaseInstrInfo:: -ConvertToSetZeroFlag(MachineInstr *MI, MachineInstr *CmpInstr) const { +OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, + int CmpValue, const MachineRegisterInfo *MRI) const { + if (CmpValue != 0) + return false; + + MachineRegisterInfo::def_iterator DI = MRI->def_begin(SrcReg); + if (llvm::next(DI) != MRI->def_end()) + // Only support one definition. + return false; + + MachineInstr *MI = &*DI; + + // Masked compares sometimes use the same register as the corresponding 'and'. + if (CmpMask != ~0) { + if (!isSuitableForMask(MI, SrcReg, CmpMask, false)) { + MI = 0; + for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg), + UE = MRI->use_end(); UI != UE; ++UI) { + if (UI->getParent() != CmpInstr->getParent()) continue; + MachineInstr *PotentialAND = &*UI; + if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true)) + continue; + MI = PotentialAND; + break; + } + if (!MI) return false; + } + } + // Conservatively refuse to convert an instruction which isn't in the same BB // as the comparison. if (MI->getParent() != CmpInstr->getParent()) @@ -1391,16 +1584,20 @@ ConvertToSetZeroFlag(MachineInstr *MI, MachineInstr *CmpInstr) const { // want to change. MachineBasicBlock::const_iterator I = CmpInstr, E = MI, B = MI->getParent()->begin(); + + // Early exit if CmpInstr is at the beginning of the BB. + if (I == B) return false; + --I; for (; I != E; --I) { const MachineInstr &Instr = *I; for (unsigned IO = 0, EO = Instr.getNumOperands(); IO != EO; ++IO) { const MachineOperand &MO = Instr.getOperand(IO); - if (!MO.isReg() || !MO.isDef()) continue; + if (!MO.isReg()) continue; - // This instruction modifies CPSR before the one we want to change. We - // can't do this transformation. + // This instruction modifies or uses CPSR after the one we want to + // change. We can't do this transformation. if (MO.getReg() == ARM::CPSR) return false; } @@ -1414,15 +1611,713 @@ ConvertToSetZeroFlag(MachineInstr *MI, MachineInstr *CmpInstr) const { switch (MI->getOpcode()) { default: break; case ARM::ADDri: + case ARM::ANDri: + case ARM::t2ANDri: case ARM::SUBri: case ARM::t2ADDri: case ARM::t2SUBri: - MI->RemoveOperand(5); - MachineInstrBuilder(MI) - .addReg(ARM::CPSR, RegState::Define | RegState::Implicit); + // Toggle the optional operand to CPSR. + MI->getOperand(5).setReg(ARM::CPSR); + MI->getOperand(5).setIsDef(true); CmpInstr->eraseFromParent(); return true; } return false; } + +bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, + MachineInstr *DefMI, unsigned Reg, + MachineRegisterInfo *MRI) const { + // Fold large immediates into add, sub, or, xor. + unsigned DefOpc = DefMI->getOpcode(); + if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm) + return false; + if (!DefMI->getOperand(1).isImm()) + // Could be t2MOVi32imm <ga:xx> + return false; + + if (!MRI->hasOneNonDBGUse(Reg)) + return false; + + unsigned UseOpc = UseMI->getOpcode(); + unsigned NewUseOpc = 0; + uint32_t ImmVal = (uint32_t)DefMI->getOperand(1).getImm(); + uint32_t SOImmValV1 = 0, SOImmValV2 = 0; + bool Commute = false; + switch (UseOpc) { + default: return false; + case ARM::SUBrr: + case ARM::ADDrr: + case ARM::ORRrr: + case ARM::EORrr: + case ARM::t2SUBrr: + case ARM::t2ADDrr: + case ARM::t2ORRrr: + case ARM::t2EORrr: { + Commute = UseMI->getOperand(2).getReg() != Reg; + switch (UseOpc) { + default: break; + case ARM::SUBrr: { + if (Commute) + return false; + ImmVal = -ImmVal; + NewUseOpc = ARM::SUBri; + // Fallthrough + } + case ARM::ADDrr: + case ARM::ORRrr: + case ARM::EORrr: { + if (!ARM_AM::isSOImmTwoPartVal(ImmVal)) + return false; + SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal); + SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal); + switch (UseOpc) { + default: break; + case ARM::ADDrr: NewUseOpc = ARM::ADDri; break; + case ARM::ORRrr: NewUseOpc = ARM::ORRri; break; + case ARM::EORrr: NewUseOpc = ARM::EORri; break; + } + break; + } + case ARM::t2SUBrr: { + if (Commute) + return false; + ImmVal = -ImmVal; + NewUseOpc = ARM::t2SUBri; + // Fallthrough + } + case ARM::t2ADDrr: + case ARM::t2ORRrr: + case ARM::t2EORrr: { + if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal)) + return false; + SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal); + SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal); + switch (UseOpc) { + default: break; + case ARM::t2ADDrr: NewUseOpc = ARM::t2ADDri; break; + case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break; + case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break; + } + break; + } + } + } + } + + unsigned OpIdx = Commute ? 2 : 1; + unsigned Reg1 = UseMI->getOperand(OpIdx).getReg(); + bool isKill = UseMI->getOperand(OpIdx).isKill(); + unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); + AddDefaultCC(AddDefaultPred(BuildMI(*UseMI->getParent(), + *UseMI, UseMI->getDebugLoc(), + get(NewUseOpc), NewReg) + .addReg(Reg1, getKillRegState(isKill)) + .addImm(SOImmValV1))); + UseMI->setDesc(get(NewUseOpc)); + UseMI->getOperand(1).setReg(NewReg); + UseMI->getOperand(1).setIsKill(); + UseMI->getOperand(2).ChangeToImmediate(SOImmValV2); + DefMI->eraseFromParent(); + return true; +} + +unsigned +ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, + const MachineInstr *MI) const { + if (!ItinData || ItinData->isEmpty()) + return 1; + + const TargetInstrDesc &Desc = MI->getDesc(); + unsigned Class = Desc.getSchedClass(); + unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; + if (UOps) + return UOps; + + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + llvm_unreachable("Unexpected multi-uops instruction!"); + break; + case ARM::VLDMQIA: + case ARM::VLDMQDB: + case ARM::VSTMQIA: + case ARM::VSTMQDB: + return 2; + + // The number of uOps for load / store multiple are determined by the number + // registers. + // + // On Cortex-A8, each pair of register loads / stores can be scheduled on the + // same cycle. The scheduling for the first load / store must be done + // separately by assuming the the address is not 64-bit aligned. + // + // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address + // is not 64-bit aligned, then AGU would take an extra cycle. For VFP / NEON + // load / store multiple, the formula is (#reg / 2) + (#reg % 2) + 1. + case ARM::VLDMDIA: + case ARM::VLDMDDB: + case ARM::VLDMDIA_UPD: + case ARM::VLDMDDB_UPD: + case ARM::VLDMSIA: + case ARM::VLDMSDB: + case ARM::VLDMSIA_UPD: + case ARM::VLDMSDB_UPD: + case ARM::VSTMDIA: + case ARM::VSTMDDB: + case ARM::VSTMDIA_UPD: + case ARM::VSTMDDB_UPD: + case ARM::VSTMSIA: + case ARM::VSTMSDB: + case ARM::VSTMSIA_UPD: + case ARM::VSTMSDB_UPD: { + unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands(); + return (NumRegs / 2) + (NumRegs % 2) + 1; + } + + case ARM::LDMIA_RET: + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + case ARM::LDMIA_UPD: + case ARM::LDMDA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + case ARM::STMIA_UPD: + case ARM::STMDA_UPD: + case ARM::STMDB_UPD: + case ARM::STMIB_UPD: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tSTMIA: + case ARM::tSTMIA_UPD: + case ARM::tPOP_RET: + case ARM::tPOP: + case ARM::tPUSH: + case ARM::t2LDMIA_RET: + case ARM::t2LDMIA: + case ARM::t2LDMDB: + case ARM::t2LDMIA_UPD: + case ARM::t2LDMDB_UPD: + case ARM::t2STMIA: + case ARM::t2STMDB: + case ARM::t2STMIA_UPD: + case ARM::t2STMDB_UPD: { + unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1; + if (Subtarget.isCortexA8()) { + if (NumRegs < 4) + return 2; + // 4 registers would be issued: 2, 2. + // 5 registers would be issued: 2, 2, 1. + UOps = (NumRegs / 2); + if (NumRegs % 2) + ++UOps; + return UOps; + } else if (Subtarget.isCortexA9()) { + UOps = (NumRegs / 2); + // If there are odd number of registers or if it's not 64-bit aligned, + // then it takes an extra AGU (Address Generation Unit) cycle. + if ((NumRegs % 2) || + !MI->hasOneMemOperand() || + (*MI->memoperands_begin())->getAlignment() < 8) + ++UOps; + return UOps; + } else { + // Assume the worst. + return NumRegs; + } + } + } +} + +int +ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData, + const TargetInstrDesc &DefTID, + unsigned DefClass, + unsigned DefIdx, unsigned DefAlign) const { + int RegNo = (int)(DefIdx+1) - DefTID.getNumOperands() + 1; + if (RegNo <= 0) + // Def is the address writeback. + return ItinData->getOperandCycle(DefClass, DefIdx); + + int DefCycle; + if (Subtarget.isCortexA8()) { + // (regno / 2) + (regno % 2) + 1 + DefCycle = RegNo / 2 + 1; + if (RegNo % 2) + ++DefCycle; + } else if (Subtarget.isCortexA9()) { + DefCycle = RegNo; + bool isSLoad = false; + + switch (DefTID.getOpcode()) { + default: break; + case ARM::VLDMSIA: + case ARM::VLDMSDB: + case ARM::VLDMSIA_UPD: + case ARM::VLDMSDB_UPD: + isSLoad = true; + break; + } + + // If there are odd number of 'S' registers or if it's not 64-bit aligned, + // then it takes an extra cycle. + if ((isSLoad && (RegNo % 2)) || DefAlign < 8) + ++DefCycle; + } else { + // Assume the worst. + DefCycle = RegNo + 2; + } + + return DefCycle; +} + +int +ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData, + const TargetInstrDesc &DefTID, + unsigned DefClass, + unsigned DefIdx, unsigned DefAlign) const { + int RegNo = (int)(DefIdx+1) - DefTID.getNumOperands() + 1; + if (RegNo <= 0) + // Def is the address writeback. + return ItinData->getOperandCycle(DefClass, DefIdx); + + int DefCycle; + if (Subtarget.isCortexA8()) { + // 4 registers would be issued: 1, 2, 1. + // 5 registers would be issued: 1, 2, 2. + DefCycle = RegNo / 2; + if (DefCycle < 1) + DefCycle = 1; + // Result latency is issue cycle + 2: E2. + DefCycle += 2; + } else if (Subtarget.isCortexA9()) { + DefCycle = (RegNo / 2); + // If there are odd number of registers or if it's not 64-bit aligned, + // then it takes an extra AGU (Address Generation Unit) cycle. + if ((RegNo % 2) || DefAlign < 8) + ++DefCycle; + // Result latency is AGU cycles + 2. + DefCycle += 2; + } else { + // Assume the worst. + DefCycle = RegNo + 2; + } + + return DefCycle; +} + +int +ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData, + const TargetInstrDesc &UseTID, + unsigned UseClass, + unsigned UseIdx, unsigned UseAlign) const { + int RegNo = (int)(UseIdx+1) - UseTID.getNumOperands() + 1; + if (RegNo <= 0) + return ItinData->getOperandCycle(UseClass, UseIdx); + + int UseCycle; + if (Subtarget.isCortexA8()) { + // (regno / 2) + (regno % 2) + 1 + UseCycle = RegNo / 2 + 1; + if (RegNo % 2) + ++UseCycle; + } else if (Subtarget.isCortexA9()) { + UseCycle = RegNo; + bool isSStore = false; + + switch (UseTID.getOpcode()) { + default: break; + case ARM::VSTMSIA: + case ARM::VSTMSDB: + case ARM::VSTMSIA_UPD: + case ARM::VSTMSDB_UPD: + isSStore = true; + break; + } + + // If there are odd number of 'S' registers or if it's not 64-bit aligned, + // then it takes an extra cycle. + if ((isSStore && (RegNo % 2)) || UseAlign < 8) + ++UseCycle; + } else { + // Assume the worst. + UseCycle = RegNo + 2; + } + + return UseCycle; +} + +int +ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData, + const TargetInstrDesc &UseTID, + unsigned UseClass, + unsigned UseIdx, unsigned UseAlign) const { + int RegNo = (int)(UseIdx+1) - UseTID.getNumOperands() + 1; + if (RegNo <= 0) + return ItinData->getOperandCycle(UseClass, UseIdx); + + int UseCycle; + if (Subtarget.isCortexA8()) { + UseCycle = RegNo / 2; + if (UseCycle < 2) + UseCycle = 2; + // Read in E3. + UseCycle += 2; + } else if (Subtarget.isCortexA9()) { + UseCycle = (RegNo / 2); + // If there are odd number of registers or if it's not 64-bit aligned, + // then it takes an extra AGU (Address Generation Unit) cycle. + if ((RegNo % 2) || UseAlign < 8) + ++UseCycle; + } else { + // Assume the worst. + UseCycle = 1; + } + return UseCycle; +} + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const TargetInstrDesc &DefTID, + unsigned DefIdx, unsigned DefAlign, + const TargetInstrDesc &UseTID, + unsigned UseIdx, unsigned UseAlign) const { + unsigned DefClass = DefTID.getSchedClass(); + unsigned UseClass = UseTID.getSchedClass(); + + if (DefIdx < DefTID.getNumDefs() && UseIdx < UseTID.getNumOperands()) + return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); + + // This may be a def / use of a variable_ops instruction, the operand + // latency might be determinable dynamically. Let the target try to + // figure it out. + int DefCycle = -1; + bool LdmBypass = false; + switch (DefTID.getOpcode()) { + default: + DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); + break; + + case ARM::VLDMDIA: + case ARM::VLDMDDB: + case ARM::VLDMDIA_UPD: + case ARM::VLDMDDB_UPD: + case ARM::VLDMSIA: + case ARM::VLDMSDB: + case ARM::VLDMSIA_UPD: + case ARM::VLDMSDB_UPD: + DefCycle = getVLDMDefCycle(ItinData, DefTID, DefClass, DefIdx, DefAlign); + break; + + case ARM::LDMIA_RET: + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + case ARM::LDMIA_UPD: + case ARM::LDMDA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tPUSH: + case ARM::t2LDMIA_RET: + case ARM::t2LDMIA: + case ARM::t2LDMDB: + case ARM::t2LDMIA_UPD: + case ARM::t2LDMDB_UPD: + LdmBypass = 1; + DefCycle = getLDMDefCycle(ItinData, DefTID, DefClass, DefIdx, DefAlign); + break; + } + + if (DefCycle == -1) + // We can't seem to determine the result latency of the def, assume it's 2. + DefCycle = 2; + + int UseCycle = -1; + switch (UseTID.getOpcode()) { + default: + UseCycle = ItinData->getOperandCycle(UseClass, UseIdx); + break; + + case ARM::VSTMDIA: + case ARM::VSTMDDB: + case ARM::VSTMDIA_UPD: + case ARM::VSTMDDB_UPD: + case ARM::VSTMSIA: + case ARM::VSTMSDB: + case ARM::VSTMSIA_UPD: + case ARM::VSTMSDB_UPD: + UseCycle = getVSTMUseCycle(ItinData, UseTID, UseClass, UseIdx, UseAlign); + break; + + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + case ARM::STMIA_UPD: + case ARM::STMDA_UPD: + case ARM::STMDB_UPD: + case ARM::STMIB_UPD: + case ARM::tSTMIA: + case ARM::tSTMIA_UPD: + case ARM::tPOP_RET: + case ARM::tPOP: + case ARM::t2STMIA: + case ARM::t2STMDB: + case ARM::t2STMIA_UPD: + case ARM::t2STMDB_UPD: + UseCycle = getSTMUseCycle(ItinData, UseTID, UseClass, UseIdx, UseAlign); + break; + } + + if (UseCycle == -1) + // Assume it's read in the first stage. + UseCycle = 1; + + UseCycle = DefCycle - UseCycle + 1; + if (UseCycle > 0) { + if (LdmBypass) { + // It's a variable_ops instruction so we can't use DefIdx here. Just use + // first def operand. + if (ItinData->hasPipelineForwarding(DefClass, DefTID.getNumOperands()-1, + UseClass, UseIdx)) + --UseCycle; + } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx, + UseClass, UseIdx)) { + --UseCycle; + } + } + + return UseCycle; +} + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || + DefMI->isRegSequence() || DefMI->isImplicitDef()) + return 1; + + const TargetInstrDesc &DefTID = DefMI->getDesc(); + if (!ItinData || ItinData->isEmpty()) + return DefTID.mayLoad() ? 3 : 1; + + const TargetInstrDesc &UseTID = UseMI->getDesc(); + const MachineOperand &DefMO = DefMI->getOperand(DefIdx); + if (DefMO.getReg() == ARM::CPSR) { + if (DefMI->getOpcode() == ARM::FMSTAT) { + // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?) + return Subtarget.isCortexA9() ? 1 : 20; + } + + // CPSR set and branch can be paired in the same cycle. + if (UseTID.isBranch()) + return 0; + } + + unsigned DefAlign = DefMI->hasOneMemOperand() + ? (*DefMI->memoperands_begin())->getAlignment() : 0; + unsigned UseAlign = UseMI->hasOneMemOperand() + ? (*UseMI->memoperands_begin())->getAlignment() : 0; + int Latency = getOperandLatency(ItinData, DefTID, DefIdx, DefAlign, + UseTID, UseIdx, UseAlign); + + if (Latency > 1 && + (Subtarget.isCortexA8() || Subtarget.isCortexA9())) { + // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] + // variants are one cycle cheaper. + switch (DefTID.getOpcode()) { + default: break; + case ARM::LDRrs: + case ARM::LDRBrs: { + unsigned ShOpVal = DefMI->getOperand(3).getImm(); + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (ShImm == 0 || + (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) + --Latency; + break; + } + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSHs: { + // Thumb2 mode: lsl only. + unsigned ShAmt = DefMI->getOperand(3).getImm(); + if (ShAmt == 0 || ShAmt == 2) + --Latency; + break; + } + } + } + + return Latency; +} + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const { + if (!DefNode->isMachineOpcode()) + return 1; + + const TargetInstrDesc &DefTID = get(DefNode->getMachineOpcode()); + + if (isZeroCost(DefTID.Opcode)) + return 0; + + if (!ItinData || ItinData->isEmpty()) + return DefTID.mayLoad() ? 3 : 1; + + if (!UseNode->isMachineOpcode()) { + int Latency = ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx); + if (Subtarget.isCortexA9()) + return Latency <= 2 ? 1 : Latency - 1; + else + return Latency <= 3 ? 1 : Latency - 2; + } + + const TargetInstrDesc &UseTID = get(UseNode->getMachineOpcode()); + const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode); + unsigned DefAlign = !DefMN->memoperands_empty() + ? (*DefMN->memoperands_begin())->getAlignment() : 0; + const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode); + unsigned UseAlign = !UseMN->memoperands_empty() + ? (*UseMN->memoperands_begin())->getAlignment() : 0; + int Latency = getOperandLatency(ItinData, DefTID, DefIdx, DefAlign, + UseTID, UseIdx, UseAlign); + + if (Latency > 1 && + (Subtarget.isCortexA8() || Subtarget.isCortexA9())) { + // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] + // variants are one cycle cheaper. + switch (DefTID.getOpcode()) { + default: break; + case ARM::LDRrs: + case ARM::LDRBrs: { + unsigned ShOpVal = + cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue(); + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (ShImm == 0 || + (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) + --Latency; + break; + } + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSHs: { + // Thumb2 mode: lsl only. + unsigned ShAmt = + cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue(); + if (ShAmt == 0 || ShAmt == 2) + --Latency; + break; + } + } + } + + return Latency; +} + +int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + if (MI->isCopyLike() || MI->isInsertSubreg() || + MI->isRegSequence() || MI->isImplicitDef()) + return 1; + + if (!ItinData || ItinData->isEmpty()) + return 1; + + const TargetInstrDesc &TID = MI->getDesc(); + unsigned Class = TID.getSchedClass(); + unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; + if (PredCost && TID.hasImplicitDefOfPhysReg(ARM::CPSR)) + // When predicated, CPSR is an additional source operand for CPSR updating + // instructions, this apparently increases their latencies. + *PredCost = 1; + if (UOps) + return ItinData->getStageLatency(Class); + return getNumMicroOps(ItinData, MI); +} + +int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const { + if (!Node->isMachineOpcode()) + return 1; + + if (!ItinData || ItinData->isEmpty()) + return 1; + + unsigned Opcode = Node->getMachineOpcode(); + switch (Opcode) { + default: + return ItinData->getStageLatency(get(Opcode).getSchedClass()); + case ARM::VLDMQIA: + case ARM::VLDMQDB: + case ARM::VSTMQIA: + case ARM::VSTMQDB: + return 2; + } +} + +bool ARMBaseInstrInfo:: +hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask; + unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask; + if (Subtarget.isCortexA8() && + (DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP)) + // CortexA8 VFP instructions are not pipelined. + return true; + + // Hoist VFP / NEON instructions with 4 or higher latency. + int Latency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); + if (Latency <= 3) + return false; + return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON || + UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON; +} + +bool ARMBaseInstrInfo:: +hasLowDefLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx) const { + if (!ItinData || ItinData->isEmpty()) + return false; + + unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask; + if (DDomain == ARMII::DomainGeneral) { + unsigned DefClass = DefMI->getDesc().getSchedClass(); + int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); + return (DefCycle != -1 && DefCycle <= 2); + } + return false; +} + +bool +ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc, + unsigned &AddSubOpc, + bool &NegAcc, bool &HasLane) const { + DenseMap<unsigned, unsigned>::const_iterator I = MLxEntryMap.find(Opcode); + if (I == MLxEntryMap.end()) + return false; + + const ARM_MLxEntry &Entry = ARM_MLxTable[I->second]; + MulOpc = Entry.MulOpc; + AddSubOpc = Entry.AddSubOpc; + NegAcc = Entry.NegAcc; + HasLane = Entry.HasLane; + return true; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index b4f4a33..1fb8872 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -17,6 +17,8 @@ #include "ARM.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" namespace llvm { class ARMSubtarget; @@ -33,7 +35,7 @@ namespace ARMII { //===------------------------------------------------------------------===// // This four-bit field describes the addressing mode used. - AddrModeMask = 0xf, + AddrModeMask = 0x1f, AddrModeNone = 0, AddrMode1 = 1, AddrMode2 = 2, @@ -50,9 +52,10 @@ namespace ARMII { AddrModeT2_so = 13, AddrModeT2_pc = 14, // +/- i12 for pc relative data AddrModeT2_i8s4 = 15, // i8 * 4 + AddrMode_i12 = 16, // Size* - Flags to keep track of the size of an instruction. - SizeShift = 4, + SizeShift = 5, SizeMask = 7 << SizeShift, SizeSpecial = 1, // 0 byte pseudo or special case. Size8Bytes = 2, @@ -61,7 +64,7 @@ namespace ARMII { // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load // and store ops only. Generic "updating" flag is used for ld/st multiple. - IndexModeShift = 7, + IndexModeShift = 8, IndexModeMask = 3 << IndexModeShift, IndexModePre = 1, IndexModePost = 2, @@ -70,7 +73,7 @@ namespace ARMII { //===------------------------------------------------------------------===// // Instruction encoding formats. // - FormShift = 9, + FormShift = 10, FormMask = 0x3f << FormShift, // Pseudo instructions @@ -143,15 +146,15 @@ namespace ARMII { // UnaryDP - Indicates this is a unary data processing instruction, i.e. // it doesn't have a Rn operand. - UnaryDP = 1 << 15, + UnaryDP = 1 << 16, // Xform16Bit - Indicates this Thumb2 instruction may be transformed into // a 16-bit Thumb instruction if certain conditions are met. - Xform16Bit = 1 << 16, + Xform16Bit = 1 << 17, //===------------------------------------------------------------------===// // Code domain. - DomainShift = 17, + DomainShift = 18, DomainMask = 3 << DomainShift, DomainGeneral = 0 << DomainShift, DomainVFP = 1 << DomainShift, @@ -160,6 +163,11 @@ namespace ARMII { //===------------------------------------------------------------------===// // Field shifts - such shifts are used to set field while generating // machine instructions. + // + // FIXME: This list will need adjusting/fixing as the MC code emitter + // takes shape and the ARMCodeEmitter.cpp bits go away. + ShiftTypeShift = 4, + M_BitShift = 5, ShiftImmShift = 5, ShiftShift = 7, @@ -181,29 +189,15 @@ namespace ARMII { I_BitShift = 25, CondShift = 28 }; - - /// Target Operand Flag enum. - enum TOF { - //===------------------------------------------------------------------===// - // ARM Specific MachineOperand flags. - - MO_NO_FLAG, - - /// MO_LO16 - On a symbol operand, this represents a relocation containing - /// lower 16 bit of the address. Used only via movw instruction. - MO_LO16, - - /// MO_HI16 - On a symbol operand, this represents a relocation containing - /// higher 16 bit of the address. Used only via movt instruction. - MO_HI16 - }; } class ARMBaseInstrInfo : public TargetInstrInfoImpl { const ARMSubtarget &Subtarget; + protected: // Can be only subclassed. explicit ARMBaseInstrInfo(const ARMSubtarget &STI); + public: // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is not such an opcode. @@ -216,10 +210,13 @@ public: virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0; const ARMSubtarget &getSubtarget() const { return Subtarget; } - bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const; + ScheduleHazardRecognizer * + CreateTargetHazardRecognizer(const TargetMachine *TM, + const ScheduleDAG *DAG) const; + + ScheduleHazardRecognizer * + CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const; // Branch analysis. virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, @@ -301,7 +298,8 @@ public: MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const; virtual bool produceSameValue(const MachineInstr *MI0, - const MachineInstr *MI1) const; + const MachineInstr *MI1, + const MachineRegisterInfo *MRI) const; /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to /// determine if two loads are loading from the same base address. It should @@ -328,26 +326,117 @@ public: const MachineFunction &MF) const; virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumInstrs) const; + unsigned NumCyles, unsigned ExtraPredCycles, + float Prob, float Confidence) const; - virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,unsigned NumT, - MachineBasicBlock &FMBB,unsigned NumF) const; + virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumT, unsigned ExtraT, + MachineBasicBlock &FMBB, + unsigned NumF, unsigned ExtraF, + float Probability, float Confidence) const; virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumInstrs) const { - return NumInstrs && NumInstrs == 1; + unsigned NumCyles, + float Probability, + float Confidence) const { + return NumCyles == 1; } /// AnalyzeCompare - For a comparison instruction, return the source register /// in SrcReg and the value it compares against in CmpValue. Return true if /// the comparison instruction can be analyzed. virtual bool AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - int &CmpValue) const; + int &CmpMask, int &CmpValue) const; - /// ConvertToSetZeroFlag - Convert the instruction to set the zero flag so + /// OptimizeCompareInstr - Convert the instruction to set the zero flag so /// that we can remove a "comparison with zero". - virtual bool ConvertToSetZeroFlag(MachineInstr *Instr, - MachineInstr *CmpInstr) const; + virtual bool OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const; + + /// FoldImmediate - 'Reg' is known to be defined by a move immediate + /// instruction, try to fold the immediate into the use instruction. + virtual bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const; + + virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData, + const MachineInstr *MI) const; + + virtual + int getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const; + virtual + int getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const; +private: + int getVLDMDefCycle(const InstrItineraryData *ItinData, + const TargetInstrDesc &DefTID, + unsigned DefClass, + unsigned DefIdx, unsigned DefAlign) const; + int getLDMDefCycle(const InstrItineraryData *ItinData, + const TargetInstrDesc &DefTID, + unsigned DefClass, + unsigned DefIdx, unsigned DefAlign) const; + int getVSTMUseCycle(const InstrItineraryData *ItinData, + const TargetInstrDesc &UseTID, + unsigned UseClass, + unsigned UseIdx, unsigned UseAlign) const; + int getSTMUseCycle(const InstrItineraryData *ItinData, + const TargetInstrDesc &UseTID, + unsigned UseClass, + unsigned UseIdx, unsigned UseAlign) const; + int getOperandLatency(const InstrItineraryData *ItinData, + const TargetInstrDesc &DefTID, + unsigned DefIdx, unsigned DefAlign, + const TargetInstrDesc &UseTID, + unsigned UseIdx, unsigned UseAlign) const; + + int getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, unsigned *PredCost = 0) const; + + int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const; + + bool hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const; + bool hasLowDefLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx) const; + +private: + /// Modeling special VFP / NEON fp MLA / MLS hazards. + + /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal + /// MLx table. + DenseMap<unsigned, unsigned> MLxEntryMap; + + /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause + /// stalls when scheduled together with fp MLA / MLS opcodes. + SmallSet<unsigned, 16> MLxHazardOpcodes; + +public: + /// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS + /// instruction. + bool isFpMLxInstruction(unsigned Opcode) const { + return MLxEntryMap.count(Opcode); + } + + /// isFpMLxInstruction - This version also returns the multiply opcode and the + /// addition / subtraction opcode to expand to. Return true for 'HasLane' for + /// the MLX instructions with an extra lane operand. + bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc, + unsigned &AddSubOpc, bool &NegAcc, + bool &HasLane) const; + + /// canCauseFpMLxStall - Return true if an instruction of the specified opcode + /// will cause stalls when scheduled after (within 4-cycle window) a fp + /// MLA / MLS instruction. + bool canCauseFpMLxStall(unsigned Opcode) const { + return MLxHazardOpcodes.count(Opcode); + } }; static inline @@ -389,7 +478,7 @@ bool isJumpTableBranchOpcode(int Opc) { static inline bool isIndirectBranchOpcode(int Opc) { - return Opc == ARM::BRIND || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; + return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; } /// getInstrPredicate - If instruction is predicated, returns its predicate @@ -413,6 +502,12 @@ void emitT2RegPlusImmediate(MachineBasicBlock &MBB, unsigned DestReg, unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, const ARMBaseInstrInfo &TII); +void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned DestReg, unsigned BaseReg, + int NumBytes, const TargetInstrInfo &TII, + const ARMBaseRegisterInfo& MRI, + DebugLoc dl); /// rewriteARMFrameIndex / rewriteT2FrameIndex - diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index eceafad..67a4b7d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -15,6 +15,7 @@ #include "ARMAddressingModes.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" +#include "ARMFrameLowering.h" #include "ARMInstrInfo.h" #include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" @@ -32,120 +33,25 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/CommandLine.h" -namespace llvm { +using namespace llvm; + static cl::opt<bool> ForceAllBaseRegAlloc("arm-force-base-reg-alloc", cl::Hidden, cl::init(false), cl::desc("Force use of virtual base registers for stack load/store")); static cl::opt<bool> EnableLocalStackAlloc("enable-local-stack-alloc", cl::init(true), cl::Hidden, cl::desc("Enable pre-regalloc stack frame index allocation")); -} - -using namespace llvm; - static cl::opt<bool> EnableBasePointer("arm-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); -unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum, - bool *isSPVFP) { - if (isSPVFP) - *isSPVFP = false; - - using namespace ARM; - switch (RegEnum) { - default: - llvm_unreachable("Unknown ARM register!"); - case R0: case D0: case Q0: return 0; - case R1: case D1: case Q1: return 1; - case R2: case D2: case Q2: return 2; - case R3: case D3: case Q3: return 3; - case R4: case D4: case Q4: return 4; - case R5: case D5: case Q5: return 5; - case R6: case D6: case Q6: return 6; - case R7: case D7: case Q7: return 7; - case R8: case D8: case Q8: return 8; - case R9: case D9: case Q9: return 9; - case R10: case D10: case Q10: return 10; - case R11: case D11: case Q11: return 11; - case R12: case D12: case Q12: return 12; - case SP: case D13: case Q13: return 13; - case LR: case D14: case Q14: return 14; - case PC: case D15: case Q15: return 15; - - case D16: return 16; - case D17: return 17; - case D18: return 18; - case D19: return 19; - case D20: return 20; - case D21: return 21; - case D22: return 22; - case D23: return 23; - case D24: return 24; - case D25: return 25; - case D26: return 26; - case D27: return 27; - case D28: return 28; - case D29: return 29; - case D30: return 30; - case D31: return 31; - - case S0: case S1: case S2: case S3: - case S4: case S5: case S6: case S7: - case S8: case S9: case S10: case S11: - case S12: case S13: case S14: case S15: - case S16: case S17: case S18: case S19: - case S20: case S21: case S22: case S23: - case S24: case S25: case S26: case S27: - case S28: case S29: case S30: case S31: { - if (isSPVFP) - *isSPVFP = true; - switch (RegEnum) { - default: return 0; // Avoid compile time warning. - case S0: return 0; - case S1: return 1; - case S2: return 2; - case S3: return 3; - case S4: return 4; - case S5: return 5; - case S6: return 6; - case S7: return 7; - case S8: return 8; - case S9: return 9; - case S10: return 10; - case S11: return 11; - case S12: return 12; - case S13: return 13; - case S14: return 14; - case S15: return 15; - case S16: return 16; - case S17: return 17; - case S18: return 18; - case S19: return 19; - case S20: return 20; - case S21: return 21; - case S22: return 22; - case S23: return 23; - case S24: return 24; - case S25: return 25; - case S26: return 26; - case S27: return 27; - case S28: return 28; - case S29: return 29; - case S30: return 30; - case S31: return 31; - } - } - } -} - ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &sti) : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), @@ -180,12 +86,14 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + // FIXME: avoid re-calculating this everytime. BitVector Reserved(getNumRegs()); Reserved.set(ARM::SP); Reserved.set(ARM::PC); Reserved.set(ARM::FPSCR); - if (hasFP(MF)) + if (TFI->hasFP(MF)) Reserved.set(FramePtr); if (hasBasePointer(MF)) Reserved.set(BasePtr); @@ -197,6 +105,8 @@ getReservedRegs(const MachineFunction &MF) const { bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF, unsigned Reg) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + switch (Reg) { default: break; case ARM::SP: @@ -208,7 +118,7 @@ bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF, break; case ARM::R7: case ARM::R11: - if (FramePtr == Reg && hasFP(MF)) + if (FramePtr == Reg && TFI->hasFP(MF)) return true; break; case ARM::R9: @@ -444,6 +354,7 @@ std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, unsigned HintType, unsigned HintReg, const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); // Alternative register allocation orders when favoring even / odd registers // of register pairs. @@ -525,7 +436,7 @@ ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, return std::make_pair(RC->allocation_order_begin(MF), RC->allocation_order_end(MF)); - if (!hasFP(MF)) { + if (!TFI->hasFP(MF)) { if (!STI.isR9Reserved()) return std::make_pair(GPREven1, GPREven1 + (sizeof(GPREven1)/sizeof(unsigned))); @@ -554,7 +465,7 @@ ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, return std::make_pair(RC->allocation_order_begin(MF), RC->allocation_order_end(MF)); - if (!hasFP(MF)) { + if (!TFI->hasFP(MF)) { if (!STI.isR9Reserved()) return std::make_pair(GPROdd1, GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned))); @@ -606,7 +517,7 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg); if ((Hint.first == (unsigned)ARMRI::RegPairOdd || Hint.first == (unsigned)ARMRI::RegPairEven) && - Hint.second && TargetRegisterInfo::isVirtualRegister(Hint.second)) { + TargetRegisterInfo::isVirtualRegister(Hint.second)) { // If 'Reg' is one of the even / odd register pair and it's now changed // (e.g. coalesced) into a different register. The other register of the // pair allocation hint must be updated to reflect the relationship @@ -619,23 +530,6 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, } } -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. This is true if the function has variable sized allocas -/// or if frame pointer elimination is disabled. -/// -bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const { - // Mac OS X requires FP not to be clobbered for backtracing purpose. - if (STI.isTargetDarwin()) - return true; - - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Always eliminate non-leaf frame pointers. - return ((DisableFramePointerElim(MF) && MFI->hasCalls()) || - needsStackRealignment(MF) || - MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken()); -} - bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -681,7 +575,7 @@ bool ARMBaseRegisterInfo:: needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); - unsigned StackAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); + unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment(); bool requiresRealignment = ((MFI->getLocalFrameMaxAlign() > StackAlign) || F->hasFnAttr(Attribute::StackAlignment)); @@ -697,417 +591,19 @@ cannotEliminateFrame(const MachineFunction &MF) const { || needsStackRealignment(MF); } -/// estimateStackSize - Estimate and return the size of the frame. -static unsigned estimateStackSize(MachineFunction &MF) { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - int Offset = 0; - for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { - int FixedOff = -FFI->getObjectOffset(i); - if (FixedOff > Offset) Offset = FixedOff; - } - for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { - if (FFI->isDeadObjectIndex(i)) - continue; - Offset += FFI->getObjectSize(i); - unsigned Align = FFI->getObjectAlignment(i); - // Adjust to alignment boundary - Offset = (Offset+Align-1)/Align*Align; - } - return (unsigned)Offset; -} - -/// estimateRSStackSizeLimit - Look at each instruction that references stack -/// frames and return the stack size limit beyond which some of these -/// instructions will require a scratch register during their expansion later. -unsigned -ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const { - const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned Limit = (1 << 12) - 1; - for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) { - for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - if (!I->getOperand(i).isFI()) continue; - - // When using ADDri to get the address of a stack object, 255 is the - // largest offset guaranteed to fit in the immediate offset. - if (I->getOpcode() == ARM::ADDri) { - Limit = std::min(Limit, (1U << 8) - 1); - break; - } - - // Otherwise check the addressing mode. - switch (I->getDesc().TSFlags & ARMII::AddrModeMask) { - case ARMII::AddrMode3: - case ARMII::AddrModeT2_i8: - Limit = std::min(Limit, (1U << 8) - 1); - break; - case ARMII::AddrMode5: - case ARMII::AddrModeT2_i8s4: - Limit = std::min(Limit, ((1U << 8) - 1) * 4); - break; - case ARMII::AddrModeT2_i12: - // i12 supports only positive offset so these will be converted to - // i8 opcodes. See llvm::rewriteT2FrameIndex. - if (hasFP(MF) && AFI->hasStackFrame()) - Limit = std::min(Limit, (1U << 8) - 1); - break; - case ARMII::AddrMode6: - // Addressing mode 6 (load/store) instructions can't encode an - // immediate offset for stack references. - return 0; - default: - break; - } - break; // At most one FI per instruction - } - } - } - - return Limit; -} - -static unsigned GetFunctionSizeInBytes(const MachineFunction &MF, - const ARMBaseInstrInfo &TII) { - unsigned FnSize = 0; - for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end(); - MBBI != E; ++MBBI) { - const MachineBasicBlock &MBB = *MBBI; - for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end(); - I != E; ++I) - FnSize += TII.GetInstSizeInBytes(I); - } - return FnSize; -} - -void -ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { - // This tells PEI to spill the FP as if it is any other callee-save register - // to take advantage the eliminateFrameIndex machinery. This also ensures it - // is spilled in the order specified by getCalleeSavedRegs() to make it easier - // to combine multiple loads / stores. - bool CanEliminateFrame = true; - bool CS1Spilled = false; - bool LRSpilled = false; - unsigned NumGPRSpills = 0; - SmallVector<unsigned, 4> UnspilledCS1GPRs; - SmallVector<unsigned, 4> UnspilledCS2GPRs; - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Spill R4 if Thumb2 function requires stack realignment - it will be used as - // scratch register. - // FIXME: It will be better just to find spare register here. - if (needsStackRealignment(MF) && - AFI->isThumb2Function()) - MF.getRegInfo().setPhysRegUsed(ARM::R4); - - // Spill LR if Thumb1 function uses variable length argument lists. - if (AFI->isThumb1OnlyFunction() && AFI->getVarArgsRegSaveSize() > 0) - MF.getRegInfo().setPhysRegUsed(ARM::LR); - - // Spill the BasePtr if it's used. - if (hasBasePointer(MF)) - MF.getRegInfo().setPhysRegUsed(BasePtr); - - // Don't spill FP if the frame can be eliminated. This is determined - // by scanning the callee-save registers to see if any is used. - const unsigned *CSRegs = getCalleeSavedRegs(); - for (unsigned i = 0; CSRegs[i]; ++i) { - unsigned Reg = CSRegs[i]; - bool Spilled = false; - if (MF.getRegInfo().isPhysRegUsed(Reg)) { - AFI->setCSRegisterIsSpilled(Reg); - Spilled = true; - CanEliminateFrame = false; - } else { - // Check alias registers too. - for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) { - if (MF.getRegInfo().isPhysRegUsed(*Aliases)) { - Spilled = true; - CanEliminateFrame = false; - } - } - } - - if (!ARM::GPRRegisterClass->contains(Reg)) - continue; - - if (Spilled) { - NumGPRSpills++; - - if (!STI.isTargetDarwin()) { - if (Reg == ARM::LR) - LRSpilled = true; - CS1Spilled = true; - continue; - } - - // Keep track if LR and any of R4, R5, R6, and R7 is spilled. - switch (Reg) { - case ARM::LR: - LRSpilled = true; - // Fallthrough - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - CS1Spilled = true; - break; - default: - break; - } - } else { - if (!STI.isTargetDarwin()) { - UnspilledCS1GPRs.push_back(Reg); - continue; - } - - switch (Reg) { - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - UnspilledCS1GPRs.push_back(Reg); - break; - default: - UnspilledCS2GPRs.push_back(Reg); - break; - } - } - } - - bool ForceLRSpill = false; - if (!LRSpilled && AFI->isThumb1OnlyFunction()) { - unsigned FnSize = GetFunctionSizeInBytes(MF, TII); - // Force LR to be spilled if the Thumb function size is > 2048. This enables - // use of BL to implement far jump. If it turns out that it's not needed - // then the branch fix up path will undo it. - if (FnSize >= (1 << 11)) { - CanEliminateFrame = false; - ForceLRSpill = true; - } - } - - // If any of the stack slot references may be out of range of an immediate - // offset, make sure a register (or a spill slot) is available for the - // register scavenger. Note that if we're indexing off the frame pointer, the - // effective stack size is 4 bytes larger since the FP points to the stack - // slot of the previous FP. Also, if we have variable sized objects in the - // function, stack slot references will often be negative, and some of - // our instructions are positive-offset only, so conservatively consider - // that case to want a spill slot (or register) as well. Similarly, if - // the function adjusts the stack pointer during execution and the - // adjustments aren't already part of our stack size estimate, our offset - // calculations may be off, so be conservative. - // FIXME: We could add logic to be more precise about negative offsets - // and which instructions will need a scratch register for them. Is it - // worth the effort and added fragility? - bool BigStack = - (RS && - (estimateStackSize(MF) + ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >= - estimateRSStackSizeLimit(MF))) - || MFI->hasVarSizedObjects() - || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); - - bool ExtraCSSpill = false; - if (BigStack || !CanEliminateFrame || cannotEliminateFrame(MF)) { - AFI->setHasStackFrame(true); - - // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled. - // Spill LR as well so we can fold BX_RET to the registers restore (LDM). - if (!LRSpilled && CS1Spilled) { - MF.getRegInfo().setPhysRegUsed(ARM::LR); - AFI->setCSRegisterIsSpilled(ARM::LR); - NumGPRSpills++; - UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(), - UnspilledCS1GPRs.end(), (unsigned)ARM::LR)); - ForceLRSpill = false; - ExtraCSSpill = true; - } - - if (hasFP(MF)) { - MF.getRegInfo().setPhysRegUsed(FramePtr); - NumGPRSpills++; - } - - // If stack and double are 8-byte aligned and we are spilling an odd number - // of GPRs. Spill one extra callee save GPR so we won't have to pad between - // the integer and double callee save areas. - unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); - if (TargetAlign == 8 && (NumGPRSpills & 1)) { - if (CS1Spilled && !UnspilledCS1GPRs.empty()) { - for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { - unsigned Reg = UnspilledCS1GPRs[i]; - // Don't spill high register if the function is thumb1 - if (!AFI->isThumb1OnlyFunction() || - isARMLowRegister(Reg) || Reg == ARM::LR) { - MF.getRegInfo().setPhysRegUsed(Reg); - AFI->setCSRegisterIsSpilled(Reg); - if (!isReservedReg(MF, Reg)) - ExtraCSSpill = true; - break; - } - } - } else if (!UnspilledCS2GPRs.empty() && - !AFI->isThumb1OnlyFunction()) { - unsigned Reg = UnspilledCS2GPRs.front(); - MF.getRegInfo().setPhysRegUsed(Reg); - AFI->setCSRegisterIsSpilled(Reg); - if (!isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } - } - - // Estimate if we might need to scavenge a register at some point in order - // to materialize a stack offset. If so, either spill one additional - // callee-saved register or reserve a special spill slot to facilitate - // register scavenging. Thumb1 needs a spill slot for stack pointer - // adjustments also, even when the frame itself is small. - if (BigStack && !ExtraCSSpill) { - // If any non-reserved CS register isn't spilled, just spill one or two - // extra. That should take care of it! - unsigned NumExtras = TargetAlign / 4; - SmallVector<unsigned, 2> Extras; - while (NumExtras && !UnspilledCS1GPRs.empty()) { - unsigned Reg = UnspilledCS1GPRs.back(); - UnspilledCS1GPRs.pop_back(); - if (!isReservedReg(MF, Reg) && - (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) || - Reg == ARM::LR)) { - Extras.push_back(Reg); - NumExtras--; - } - } - // For non-Thumb1 functions, also check for hi-reg CS registers - if (!AFI->isThumb1OnlyFunction()) { - while (NumExtras && !UnspilledCS2GPRs.empty()) { - unsigned Reg = UnspilledCS2GPRs.back(); - UnspilledCS2GPRs.pop_back(); - if (!isReservedReg(MF, Reg)) { - Extras.push_back(Reg); - NumExtras--; - } - } - } - if (Extras.size() && NumExtras == 0) { - for (unsigned i = 0, e = Extras.size(); i != e; ++i) { - MF.getRegInfo().setPhysRegUsed(Extras[i]); - AFI->setCSRegisterIsSpilled(Extras[i]); - } - } else if (!AFI->isThumb1OnlyFunction()) { - // note: Thumb1 functions spill to R12, not the stack. Reserve a slot - // closest to SP or frame pointer. - const TargetRegisterClass *RC = ARM::GPRRegisterClass; - RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); - } - } - } - - if (ForceLRSpill) { - MF.getRegInfo().setPhysRegUsed(ARM::LR); - AFI->setCSRegisterIsSpilled(ARM::LR); - AFI->setLRIsSpilledForFarJump(true); - } -} - unsigned ARMBaseRegisterInfo::getRARegister() const { return ARM::LR; } unsigned ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - if (hasFP(MF)) + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + if (TFI->hasFP(MF)) return FramePtr; return ARM::SP; } -// Provide a base+offset reference to an FI slot for debug info. It's the -// same as what we use for resolving the code-gen references for now. -// FIXME: This can go wrong when references are SP-relative and simple call -// frames aren't used. -int -ARMBaseRegisterInfo::getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { - return ResolveFrameIndexReference(MF, FI, FrameReg, 0); -} - -int -ARMBaseRegisterInfo::ResolveFrameIndexReference(const MachineFunction &MF, - int FI, - unsigned &FrameReg, - int SPAdj) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); - int FPOffset = Offset - AFI->getFramePtrSpillOffset(); - bool isFixed = MFI->isFixedObjectIndex(FI); - - FrameReg = ARM::SP; - Offset += SPAdj; - if (AFI->isGPRCalleeSavedArea1Frame(FI)) - return Offset - AFI->getGPRCalleeSavedArea1Offset(); - else if (AFI->isGPRCalleeSavedArea2Frame(FI)) - return Offset - AFI->getGPRCalleeSavedArea2Offset(); - else if (AFI->isDPRCalleeSavedAreaFrame(FI)) - return Offset - AFI->getDPRCalleeSavedAreaOffset(); - - // When dynamically realigning the stack, use the frame pointer for - // parameters, and the stack/base pointer for locals. - if (needsStackRealignment(MF)) { - assert (hasFP(MF) && "dynamic stack realignment without a FP!"); - if (isFixed) { - FrameReg = getFrameRegister(MF); - Offset = FPOffset; - } else if (MFI->hasVarSizedObjects()) { - assert(hasBasePointer(MF) && - "VLAs and dynamic stack alignment, but missing base pointer!"); - FrameReg = BasePtr; - } - return Offset; - } - - // If there is a frame pointer, use it when we can. - if (hasFP(MF) && AFI->hasStackFrame()) { - // Use frame pointer to reference fixed objects. Use it for locals if - // there are VLAs (and thus the SP isn't reliable as a base). - if (isFixed || (MFI->hasVarSizedObjects() && !hasBasePointer(MF))) { - FrameReg = getFrameRegister(MF); - return FPOffset; - } else if (MFI->hasVarSizedObjects()) { - assert(hasBasePointer(MF) && "missing base pointer!"); - // Use the base register since we have it. - FrameReg = BasePtr; - } else if (AFI->isThumb2Function()) { - // In Thumb2 mode, the negative offset is very limited. Try to avoid - // out of range references. - if (FPOffset >= -255 && FPOffset < 0) { - FrameReg = getFrameRegister(MF); - return FPOffset; - } - } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) { - // Otherwise, use SP or FP, whichever is closer to the stack slot. - FrameReg = getFrameRegister(MF); - return FPOffset; - } - } - // Use the base pointer if we have one. - if (hasBasePointer(MF)) - FrameReg = BasePtr; - return Offset; -} - -int -ARMBaseRegisterInfo::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); -} - unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const { llvm_unreachable("What is the exception register"); return 0; @@ -1320,7 +816,7 @@ emitLoadConstPool(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp)) .addReg(DestReg, getDefRegState(true), SubIdx) .addConstantPoolIndex(Idx) - .addReg(0).addImm(0).addImm(Pred).addReg(PredReg); + .addImm(0).addImm(Pred).addReg(PredReg); } bool ARMBaseRegisterInfo:: @@ -1338,34 +834,6 @@ requiresVirtualBaseRegisters(const MachineFunction &MF) const { return EnableLocalStackAlloc; } -// hasReservedCallFrame - Under normal circumstances, when a frame pointer is -// not required, we reserve argument space for call sites in the function -// immediately on entry to the current function. This eliminates the need for -// add/sub sp brackets around call sites. Returns true if the call frame is -// included as part of the stack frame. -bool ARMBaseRegisterInfo:: -hasReservedCallFrame(const MachineFunction &MF) const { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - unsigned CFSize = FFI->getMaxCallFrameSize(); - // It's not always a good idea to include the call frame as part of the - // stack frame. ARM (especially Thumb) has small immediate offset to - // address the stack frame. So a large call frame can cause poor codegen - // and may even makes it impossible to scavenge a register. - if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12 - return false; - - return !MF.getFrameInfo()->hasVarSizedObjects(); -} - -// canSimplifyCallFramePseudos - If there is a reserved call frame, the -// call frame pseudos can be simplified. Unlike most targets, having a FP -// is not sufficient here since we still may reference some objects via SP -// even when FP is available in Thumb2 mode. -bool ARMBaseRegisterInfo:: -canSimplifyCallFramePseudos(const MachineFunction &MF) const { - return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects(); -} - static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, @@ -1384,7 +852,8 @@ emitSPUpdate(bool isARM, void ARMBaseRegisterInfo:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - if (!hasReservedCallFrame(MF)) { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + if (!TFI->hasReservedCallFrame(MF)) { // If we have alloca, convert as follows: // ADJCALLSTACKDOWN -> sub, sp, sp, amount // ADJCALLSTACKUP -> add, sp, sp, amount @@ -1395,7 +864,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); + unsigned Align = TFI->getStackAlignment(); Amount = (Amount+Align-1)/Align*Align; ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -1433,8 +902,7 @@ getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const { switch (AddrMode) { case ARMII::AddrModeT2_i8: case ARMII::AddrModeT2_i12: - // i8 supports only negative, and i12 supports only positive, so - // based on Offset sign, consider the appropriate instruction + case ARMII::AddrMode_i12: InstrOffs = MI->getOperand(Idx+1).getImm(); Scale = 1; break; @@ -1496,8 +964,8 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // return false for everything else. unsigned Opc = MI->getOpcode(); switch (Opc) { - case ARM::LDR: case ARM::LDRH: case ARM::LDRB: - case ARM::STR: case ARM::STRH: case ARM::STRB: + case ARM::LDRi12: case ARM::LDRH: case ARM::LDRBi12: + case ARM::STRi12: case ARM::STRH: case ARM::STRBi12: case ARM::t2LDRi12: case ARM::t2LDRi8: case ARM::t2STRi12: case ARM::t2STRi8: case ARM::VLDRS: case ARM::VLDRD: @@ -1516,6 +984,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // Note that the incoming offset is based on the SP value at function entry, // so it'll be negative. MachineFunction &MF = *MI->getParent()->getParent(); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -1542,8 +1011,8 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // The FP is only available if there is no dynamic realignment. We // don't know for sure yet whether we'll need that, so we guess based // on whether there are any local variables that would trigger it. - unsigned StackAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); - if (hasFP(MF) && + unsigned StackAlign = TFI->getStackAlignment(); + if (TFI->hasFP(MF) && !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) { if (isFrameOffsetLegal(MI, FPOffset)) return false; @@ -1560,19 +1029,25 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { return true; } -/// materializeFrameBaseRegister - Insert defining instruction(s) for -/// BaseReg to be a pointer to FrameIdx before insertion point I. +/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to +/// be a pointer to FrameIdx at the beginning of the basic block. void ARMBaseRegisterInfo:: -materializeFrameBaseRegister(MachineBasicBlock::iterator I, unsigned BaseReg, - int FrameIdx, int64_t Offset) const { - ARMFunctionInfo *AFI = - I->getParent()->getParent()->getInfo<ARMFunctionInfo>(); +materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, int FrameIdx, + int64_t Offset) const { + ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>(); unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : (AFI->isThumb1OnlyFunction() ? ARM::tADDrSPi : ARM::t2ADDri); + MachineBasicBlock::iterator Ins = MBB->begin(); + DebugLoc DL; // Defaults to "unknown" + if (Ins != MBB->end()) + DL = Ins->getDebugLoc(); + MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII.get(ADDriOpc), BaseReg) + BuildMI(*MBB, Ins, DL, TII.get(ADDriOpc), BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); + if (!AFI->isThumb1OnlyFunction()) AddDefaultCC(AddDefaultPred(MIB)); } @@ -1640,6 +1115,7 @@ bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, NumBits = 8; Scale = 4; break; + case ARMII::AddrMode_i12: case ARMII::AddrMode2: NumBits = 12; break; @@ -1679,6 +1155,8 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const ARMFrameLowering *TFI = + static_cast<const ARMFrameLowering*>(MF.getTarget().getFrameLowering()); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && "This eliminateFrameIndex does not support Thumb1!"); @@ -1691,7 +1169,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FrameIndex = MI.getOperand(i).getIndex(); unsigned FrameReg; - int Offset = ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj); + int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj); // Special handling of dbg_value instructions. if (MI.isDebugValue()) { @@ -1737,339 +1215,13 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, Pred, PredReg, TII); } + // Update the original instruction to use the scratch register. MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true); + if (MI.getOpcode() == ARM::t2ADDrSPi) + MI.setDesc(TII.get(ARM::t2ADDri)); + else if (MI.getOpcode() == ARM::t2SUBrSPi) + MI.setDesc(TII.get(ARM::t2SUBri)); } } -/// Move iterator past the next bunch of callee save load / store ops for -/// the particular spill area (1: integer area 1, 2: integer area 2, -/// 3: fp area, 0: don't care). -static void movePastCSLoadStoreOps(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - int Opc1, int Opc2, unsigned Area, - const ARMSubtarget &STI) { - while (MBBI != MBB.end() && - ((MBBI->getOpcode() == Opc1) || (MBBI->getOpcode() == Opc2)) && - MBBI->getOperand(1).isFI()) { - if (Area != 0) { - bool Done = false; - unsigned Category = 0; - switch (MBBI->getOperand(0).getReg()) { - case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7: - case ARM::LR: - Category = 1; - break; - case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11: - Category = STI.isTargetDarwin() ? 2 : 1; - break; - case ARM::D8: case ARM::D9: case ARM::D10: case ARM::D11: - case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15: - Category = 3; - break; - default: - Done = true; - break; - } - if (Done || Category != Area) - break; - } - - ++MBBI; - } -} - -void ARMBaseRegisterInfo:: -emitPrologue(MachineFunction &MF) const { - MachineBasicBlock &MBB = MF.front(); - MachineBasicBlock::iterator MBBI = MBB.begin(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - assert(!AFI->isThumb1OnlyFunction() && - "This emitPrologue does not support Thumb1!"); - bool isARM = !AFI->isThumbFunction(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); - unsigned NumBytes = MFI->getStackSize(); - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); - - // Determine the sizes of each callee-save spill areas and record which frame - // belongs to which callee-save spill areas. - unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; - int FramePtrSpillFI = 0; - - // Allocate the vararg register save area. This is not counted in NumBytes. - if (VARegSaveSize) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize); - - if (!AFI->hasStackFrame()) { - if (NumBytes != 0) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); - return; - } - - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - int FI = CSI[i].getFrameIdx(); - switch (Reg) { - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - if (Reg == FramePtr) - FramePtrSpillFI = FI; - AFI->addGPRCalleeSavedArea1Frame(FI); - GPRCS1Size += 4; - break; - case ARM::R8: - case ARM::R9: - case ARM::R10: - case ARM::R11: - if (Reg == FramePtr) - FramePtrSpillFI = FI; - if (STI.isTargetDarwin()) { - AFI->addGPRCalleeSavedArea2Frame(FI); - GPRCS2Size += 4; - } else { - AFI->addGPRCalleeSavedArea1Frame(FI); - GPRCS1Size += 4; - } - break; - default: - AFI->addDPRCalleeSavedAreaFrame(FI); - DPRCSSize += 8; - } - } - - // Build the new SUBri to adjust SP for integer callee-save spill area 1. - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS1Size); - movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 1, STI); - - // Set FP to point to the stack slot that contains the previous FP. - // For Darwin, FP is R7, which has now been stored in spill area 1. - // Otherwise, if this is not Darwin, all the callee-saved registers go - // into spill area 1, including the FP in R11. In either case, it is - // now safe to emit this assignment. - bool HasFP = hasFP(MF); - if (HasFP) { - unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri; - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr) - .addFrameIndex(FramePtrSpillFI).addImm(0); - AddDefaultCC(AddDefaultPred(MIB)); - } - - // Build the new SUBri to adjust SP for integer callee-save spill area 2. - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS2Size); - - // Build the new SUBri to adjust SP for FP callee-save spill area. - movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 2, STI); - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRCSSize); - - // Determine starting offsets of spill areas. - unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); - unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; - unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; - if (HasFP) - AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + - NumBytes); - AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); - AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); - AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); - - movePastCSLoadStoreOps(MBB, MBBI, ARM::VSTRD, 0, 3, STI); - NumBytes = DPRCSOffset; - if (NumBytes) { - // Adjust SP after all the callee-save spills. - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); - if (HasFP) - AFI->setShouldRestoreSPFromFP(true); - } - - if (STI.isTargetELF() && hasFP(MF)) { - MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - - AFI->getFramePtrSpillOffset()); - AFI->setShouldRestoreSPFromFP(true); - } - - AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); - AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); - AFI->setDPRCalleeSavedAreaSize(DPRCSSize); - - // If we need dynamic stack realignment, do it here. Be paranoid and make - // sure if we also have VLAs, we have a base pointer for frame access. - if (needsStackRealignment(MF)) { - unsigned MaxAlign = MFI->getMaxAlignment(); - assert (!AFI->isThumb1OnlyFunction()); - if (!AFI->isThumbFunction()) { - // Emit bic sp, sp, MaxAlign - AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, - TII.get(ARM::BICri), ARM::SP) - .addReg(ARM::SP, RegState::Kill) - .addImm(MaxAlign-1))); - } else { - // We cannot use sp as source/dest register here, thus we're emitting the - // following sequence: - // mov r4, sp - // bic r4, r4, MaxAlign - // mov sp, r4 - // FIXME: It will be better just to find spare register here. - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2tgpr), ARM::R4) - .addReg(ARM::SP, RegState::Kill); - AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, - TII.get(ARM::t2BICri), ARM::R4) - .addReg(ARM::R4, RegState::Kill) - .addImm(MaxAlign-1))); - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) - .addReg(ARM::R4, RegState::Kill); - } - - AFI->setShouldRestoreSPFromFP(true); - } - - // If we need a base pointer, set it up here. It's whatever the value - // of the stack pointer is at this point. Any variable size objects - // will be allocated after this, so we can still use the base pointer - // to reference locals. - if (hasBasePointer(MF)) { - if (isARM) - BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), BasePtr) - .addReg(ARM::SP) - .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); - else - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), BasePtr) - .addReg(ARM::SP); - } - - // If the frame has variable sized objects then the epilogue must restore - // the sp from fp. - if (!AFI->shouldRestoreSPFromFP() && MFI->hasVarSizedObjects()) - AFI->setShouldRestoreSPFromFP(true); -} - -static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -static bool isCSRestore(MachineInstr *MI, - const ARMBaseInstrInfo &TII, - const unsigned *CSRegs) { - return ((MI->getOpcode() == (int)ARM::VLDRD || - MI->getOpcode() == (int)ARM::LDR || - MI->getOpcode() == (int)ARM::t2LDRi12) && - MI->getOperand(1).isFI() && - isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs)); -} - -void ARMBaseRegisterInfo:: -emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = prior(MBB.end()); - assert(MBBI->getDesc().isReturn() && - "Can only insert epilog into returning blocks"); - unsigned RetOpcode = MBBI->getOpcode(); - DebugLoc dl = MBBI->getDebugLoc(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - assert(!AFI->isThumb1OnlyFunction() && - "This emitEpilogue does not support Thumb1!"); - bool isARM = !AFI->isThumbFunction(); - - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); - int NumBytes = (int)MFI->getStackSize(); - - if (!AFI->hasStackFrame()) { - if (NumBytes != 0) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); - } else { - // Unwind MBBI to point to first LDR / VLDRD. - const unsigned *CSRegs = getCalleeSavedRegs(); - if (MBBI != MBB.begin()) { - do - --MBBI; - while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); - if (!isCSRestore(MBBI, TII, CSRegs)) - ++MBBI; - } - - // Move SP to start of FP callee save spill area. - NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + - AFI->getGPRCalleeSavedArea2Size() + - AFI->getDPRCalleeSavedAreaSize()); - - // Reset SP based on frame pointer only if the stack frame extends beyond - // frame pointer stack slot or target is ELF and the function has FP. - if (AFI->shouldRestoreSPFromFP()) { - NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; - if (NumBytes) { - if (isARM) - emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, - ARMCC::AL, 0, TII); - else - emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, - ARMCC::AL, 0, TII); - } else { - // Thumb2 or ARM. - if (isARM) - BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP) - .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); - else - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP) - .addReg(FramePtr); - } - } else if (NumBytes) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); - - // Move SP to start of integer callee save spill area 2. - movePastCSLoadStoreOps(MBB, MBBI, ARM::VLDRD, 0, 3, STI); - emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedAreaSize()); - - // Move SP to start of integer callee save spill area 1. - movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 2, STI); - emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea2Size()); - - // Move SP to SP upon entry to the function. - movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 1, STI); - emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea1Size()); - } - - if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND || - RetOpcode == ARM::TCRETURNri || RetOpcode == ARM::TCRETURNriND) { - // Tail call return: adjust the stack pointer and jump to callee. - MBBI = prior(MBB.end()); - MachineOperand &JumpTarget = MBBI->getOperand(0); - - // Jump to label or value in register. - if (RetOpcode == ARM::TCRETURNdi) { - BuildMI(MBB, MBBI, dl, - TII.get(STI.isThumb() ? ARM::TAILJMPdt : ARM::TAILJMPd)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), - JumpTarget.getTargetFlags()); - } else if (RetOpcode == ARM::TCRETURNdiND) { - BuildMI(MBB, MBBI, dl, - TII.get(STI.isThumb() ? ARM::TAILJMPdNDt : ARM::TAILJMPdND)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), - JumpTarget.getTargetFlags()); - } else if (RetOpcode == ARM::TCRETURNri) { - BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPr)). - addReg(JumpTarget.getReg(), RegState::Kill); - } else if (RetOpcode == ARM::TCRETURNriND) { - BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPrND)). - addReg(JumpTarget.getReg(), RegState::Kill); - } - - MachineInstr *NewMI = prior(MBBI); - for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) - NewMI->addOperand(MBBI->getOperand(i)); - - // Delete the pseudo instruction TCRETURN. - MBB.erase(MBBI); - } - - if (VARegSaveSize) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize); -} - #include "ARMGenRegisterInfo.inc" diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index fa2eb6c..ba6bd2b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -44,6 +44,45 @@ static inline bool isARMLowRegister(unsigned Reg) { } } +/// isARMArea1Register - Returns true if the register is a low register (r0-r7) +/// or a stack/pc register that we should push/pop. +static inline bool isARMArea1Register(unsigned Reg, bool isDarwin) { + using namespace ARM; + switch (Reg) { + case R0: case R1: case R2: case R3: + case R4: case R5: case R6: case R7: + case LR: case SP: case PC: + return true; + case R8: case R9: case R10: case R11: + // For darwin we want r7 and lr to be next to each other. + return !isDarwin; + default: + return false; + } +} + +static inline bool isARMArea2Register(unsigned Reg, bool isDarwin) { + using namespace ARM; + switch (Reg) { + case R8: case R9: case R10: case R11: + // Darwin has this second area. + return isDarwin; + default: + return false; + } +} + +static inline bool isARMArea3Register(unsigned Reg, bool isDarwin) { + using namespace ARM; + switch (Reg) { + case D15: case D14: case D13: case D12: + case D11: case D10: case D9: case D8: + return true; + default: + return false; + } +} + class ARMBaseRegisterInfo : public ARMGenRegisterInfo { protected: const ARMBaseInstrInfo &TII; @@ -65,12 +104,6 @@ protected: unsigned getOpcode(int Op) const; public: - /// getRegisterNumbering - Given the enum value for some register, e.g. - /// ARM::LR, return the number that it corresponds to (e.g. 14). It - /// also returns true in isSPVFP if the register is a single precision - /// VFP register. - static unsigned getRegisterNumbering(unsigned RegEnum, bool *isSPVFP = 0); - /// Code Generation virtual methods... const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; @@ -106,14 +139,13 @@ public: void UpdateRegAllocHint(unsigned Reg, unsigned NewReg, MachineFunction &MF) const; - bool hasFP(const MachineFunction &MF) const; bool hasBasePointer(const MachineFunction &MF) const; bool canRealignStack(const MachineFunction &MF) const; bool needsStackRealignment(const MachineFunction &MF) const; int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const; bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const; - void materializeFrameBaseRegister(MachineBasicBlock::iterator I, + void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, int FrameIdx, int64_t Offset) const; void resolveFrameIndex(MachineBasicBlock::iterator I, @@ -122,17 +154,10 @@ public: bool cannotEliminateFrame(const MachineFunction &MF) const; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = NULL) const; - // Debug information queries. unsigned getRARegister() const; unsigned getFrameRegister(const MachineFunction &MF) const; - int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const; - int ResolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, int SPAdj) const; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + unsigned getBaseRegister() const { return BasePtr; } // Exception handling queries. unsigned getEHExceptionRegister() const; @@ -162,9 +187,6 @@ public: virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const; - virtual bool hasReservedCallFrame(const MachineFunction &MF) const; - virtual bool canSimplifyCallFramePseudos(const MachineFunction &MF) const; - virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; @@ -172,12 +194,7 @@ public: virtual void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, RegScavenger *RS = NULL) const; - virtual void emitPrologue(MachineFunction &MF) const; - virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; - private: - unsigned estimateRSStackSizeLimit(MachineFunction &MF) const; - unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const; unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const; diff --git a/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h b/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h index 3b38375..69eddf0 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h +++ b/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // This file contains enumerations and support routines for ARM build attributes -// as defined in ARM ABI addenda document (ABI release 2.07). +// as defined in ARM ABI addenda document (ABI release 2.08). // //===----------------------------------------------------------------------===// @@ -16,7 +16,14 @@ #define __TARGET_ARMBUILDATTRS_H__ namespace ARMBuildAttrs { - enum { + enum SpecialAttr { + // This is for the .cpu asm attr. It translates into one or more + // AttrType (below) entries in the .ARM.attributes section in the ELF. + SEL_CPU + }; + + enum AttrType { + // Rest correspond to ELF/.ARM.attributes File = 1, Section = 2, Symbol = 3, @@ -52,12 +59,72 @@ namespace ARMBuildAttrs { CPU_unaligned_access = 34, VFP_HP_extension = 36, ABI_FP_16bit_format = 38, + MPextension_use = 42, // was 70, 2.08 ABI + DIV_use = 44, nodefaults = 64, also_compatible_with = 65, T2EE_use = 66, conformance = 67, Virtualization_use = 68, - MPextension_use = 70 + MPextension_use_old = 70 + }; + + // Magic numbers for .ARM.attributes + enum AttrMagic { + Format_Version = 0x41 + }; + + // Legal Values for CPU_arch, (=6), uleb128 + enum CPUArch { + Pre_v4 = 0, + v4 = 1, // e.g. SA110 + v4T = 2, // e.g. ARM7TDMI + v5T = 3, // e.g. ARM9TDMI + v5TE = 4, // e.g. ARM946E_S + v5TEJ = 5, // e.g. ARM926EJ_S + v6 = 6, // e.g. ARM1136J_S + v6KZ = 7, // e.g. ARM1176JZ_S + v6T2 = 8, // e.g. ARM1156T2F_S + v6K = 9, // e.g. ARM1136J_S + v7 = 10, // e.g. Cortex A8, Cortex M3 + v6_M = 11, // e.g. Cortex M1 + v6S_M = 12, // v6_M with the System extensions + v7E_M = 13 // v7_M with DSP extensions + }; + + enum CPUArchProfile { // (=7), uleb128 + Not_Applicable = 0, // pre v7, or cross-profile code + ApplicationProfile = (0x41), // 'A' (e.g. for Cortex A8) + RealTimeProfile = (0x52), // 'R' (e.g. for Cortex R4) + MicroControllerProfile = (0x4D), // 'M' (e.g. for Cortex M3) + SystemProfile = (0x53) // 'S' Application or real-time profile + }; + + // The following have a lot of common use cases + enum { + //ARMISAUse (=8), uleb128 and THUMBISAUse (=9), uleb128 + Not_Allowed = 0, + Allowed = 1, + + // FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10) + AllowFPv2 = 2, // v2 FP ISA permitted (implies use of the v1 FP ISA) + AllowFPv3A = 3, // v3 FP ISA permitted (implies use of the v2 FP ISA) + AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31 + AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA) + AllowFPv4B = 6, // v4 FP ISA was permitted, but only D0-D15, S0-S31 + + // Tag_WMMX_arch, (=11), uleb128 + AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions) + + // Tag_WMMX_arch, (=11), uleb128 + AllowWMMXv1 = 2, // The user permitted this entity to use WMMX v2 + + // Tag_ABI_FP_denormal, (=20), uleb128 + PreserveFPSign = 2, // sign when flushed-to-zero is preserved + + // Tag_ABI_FP_number_model, (=23), uleb128 + AllowRTABI = 2, // numbers, infinities, and one quiet NaN (see [RTABI]) + AllowIEE754 = 3 // this code to use all the IEEE 754-defined FP encodings }; } diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h new file mode 100644 index 0000000..ff7db1f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h @@ -0,0 +1,160 @@ +//===-- ARMCallingConv.h - ARM Custom Calling Convention Routines ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the ARM Calling Convention that +// aren't done by tablegen. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMCALLINGCONV_H +#define ARMCALLINGCONV_H + +#include "llvm/CallingConv.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "ARMBaseInstrInfo.h" +#include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" +#include "ARM.h" + +namespace llvm { + +// APCS f64 is in register pairs, possibly split to stack +static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { + static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + + // Try to get the first register. + if (unsigned Reg = State.AllocateReg(RegList, 4)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else { + // For the 2nd half of a v2f64, do not fail. + if (CanFail) + return false; + + // Put the whole thing on the stack. + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 4), + LocVT, LocInfo)); + return true; + } + + // Try to get the second register. + if (unsigned Reg = State.AllocateReg(RegList, 4)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(4, 4), + LocVT, LocInfo)); + return true; +} + +static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) + return false; + if (LocVT == MVT::v2f64 && + !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) + return false; + return true; // we handled it +} + +// AAPCS f64 is in aligned register pairs +static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { + static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; + static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; + static const unsigned ShadowRegList[] = { ARM::R0, ARM::R1 }; + + unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2); + if (Reg == 0) { + // For the 2nd half of a v2f64, do not just fail. + if (CanFail) + return false; + + // Put the whole thing on the stack. + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 8), + LocVT, LocInfo)); + return true; + } + + unsigned i; + for (i = 0; i < 2; ++i) + if (HiRegList[i] == Reg) + break; + + unsigned T = State.AllocateReg(LoRegList[i]); + (void)T; + assert(T == LoRegList[i] && "Could not allocate register"); + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) + return false; + if (LocVT == MVT::v2f64 && + !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) + return false; + return true; // we handled it +} + +static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, CCState &State) { + static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; + static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; + + unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); + if (Reg == 0) + return false; // we didn't handle it + + unsigned i; + for (i = 0; i < 2; ++i) + if (HiRegList[i] == Reg) + break; + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) + return false; + if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) + return false; + return true; // we handled it +} + +static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, + State); +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td index 293e32a..426ba13 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td @@ -53,6 +53,34 @@ def RetCC_ARM_APCS : CallingConv<[ ]>; //===----------------------------------------------------------------------===// +// ARM APCS Calling Convention for FastCC (when VFP2 or later is available) +//===----------------------------------------------------------------------===// +def FastCC_ARM_APCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<CC_ARM_APCS> +]>; + +def RetFastCC_ARM_APCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<RetCC_ARM_APCS> +]>; + + +//===----------------------------------------------------------------------===// // ARM AAPCS (EABI) Calling Convention, common parts //===----------------------------------------------------------------------===// @@ -105,6 +133,7 @@ def RetCC_ARM_AAPCS : CallingConv<[ //===----------------------------------------------------------------------===// // ARM AAPCS-VFP (EABI) Calling Convention +// Also used for FastCC (when VFP2 or later is available) //===----------------------------------------------------------------------===// def CC_ARM_AAPCS_VFP : CallingConv<[ diff --git a/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp index b1a702f..9bbf6a0 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp @@ -74,7 +74,7 @@ namespace { /// getBinaryCodeForInstr - This function, generated by the /// CodeEmitterGenerator using TableGen, produces the binary encoding for /// machine instructions. - unsigned getBinaryCodeForInstr(const MachineInstr &MI); + unsigned getBinaryCodeForInstr(const MachineInstr &MI) const; bool runOnMachineFunction(MachineFunction &MF); @@ -101,7 +101,6 @@ namespace { unsigned OpIdx); unsigned getMachineSoImmOpValue(unsigned SoImm); - unsigned getAddrModeSBit(const MachineInstr &MI, const TargetInstrDesc &TID) const; @@ -140,8 +139,6 @@ namespace { void emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI); - void emitMiscInstruction(const MachineInstr &MI); - void emitNEONLaneInstruction(const MachineInstr &MI); void emitNEONDupInstruction(const MachineInstr &MI); void emitNEON1RegModImmInstruction(const MachineInstr &MI); @@ -150,20 +147,176 @@ namespace { /// getMachineOpValue - Return binary encoding of operand. If the machine /// operand requires relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MachineInstr &MI,const MachineOperand &MO); - unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) { + unsigned getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const; + unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) const { return getMachineOpValue(MI, MI.getOperand(OpIdx)); } + // FIXME: The legacy JIT ARMCodeEmitter doesn't rely on the the + // TableGen'erated getBinaryCodeForInstr() function to encode any + // operand values, instead querying getMachineOpValue() directly for + // each operand it needs to encode. Thus, any of the new encoder + // helper functions can simply return 0 as the values the return + // are already handled elsewhere. They are placeholders to allow this + // encoder to continue to function until the MC encoder is sufficiently + // far along that this one can be eliminated entirely. + unsigned NEONThumb2DataIPostEncoder(const MachineInstr &MI, unsigned Val) + const { return 0; } + unsigned NEONThumb2LoadStorePostEncoder(const MachineInstr &MI,unsigned Val) + const { return 0; } + unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val) + const { return 0; } + unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val) + const { return 0; } + unsigned getAdrLabelOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbAdrLabelOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbBLTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbBLXTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbBRTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbBCCTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbCBTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getUnconditionalBranchTargetOpValue(const MachineInstr &MI, + unsigned Op) const { return 0; } + unsigned getARMBranchTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getCCOutOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getSOImmOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2SOImmOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getSORegOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbAddrModeRegRegOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm12OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm8OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm8s4OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm8OffsetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm12OffsetOpValue(const MachineInstr &MI,unsigned Op) + const { return 0; } + unsigned getT2AddrModeSORegOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2SORegOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getRotImmOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getImmMinusOneOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AdrLabelOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getAddrMode6DupAddressOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getAddrMode6OffsetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getBitfieldInvertedMaskOpValue(const MachineInstr &MI, + unsigned Op) const { return 0; } + unsigned getMsbOpValue(const MachineInstr &MI, + unsigned Op) const { return 0; } + uint32_t getLdStmModeOpValue(const MachineInstr &MI, unsigned OpIdx) + const {return 0; } + uint32_t getLdStSORegOpValue(const MachineInstr &MI, unsigned OpIdx) + const { return 0; } + + unsigned getAddrModeImm12OpValue(const MachineInstr &MI, unsigned Op) + const { + // {17-13} = reg + // {12} = (U)nsigned (add == '1', sub == '0') + // {11-0} = imm12 + const MachineOperand &MO = MI.getOperand(Op); + const MachineOperand &MO1 = MI.getOperand(Op + 1); + if (!MO.isReg()) { + emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry); + return 0; + } + unsigned Reg = getARMRegisterNumbering(MO.getReg()); + int32_t Imm12 = MO1.getImm(); + uint32_t Binary; + Binary = Imm12 & 0xfff; + if (Imm12 >= 0) + Binary |= (1 << 12); + Binary |= (Reg << 13); + return Binary; + } + + unsigned getHiLo16ImmOpValue(const MachineInstr &MI, unsigned Op) const { + return 0; + } + + uint32_t getAddrMode2OpValue(const MachineInstr &MI, unsigned OpIdx) + const { return 0;} + uint32_t getAddrMode2OffsetOpValue(const MachineInstr &MI, unsigned OpIdx) + const { return 0;} + uint32_t getAddrMode3OffsetOpValue(const MachineInstr &MI, unsigned OpIdx) + const { return 0;} + uint32_t getAddrMode3OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + uint32_t getAddrModeThumbSPOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + uint32_t getAddrModeSOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + uint32_t getAddrModeISOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + uint32_t getAddrModePCOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + uint32_t getAddrMode5OpValue(const MachineInstr &MI, unsigned Op) const { + // {17-13} = reg + // {12} = (U)nsigned (add == '1', sub == '0') + // {11-0} = imm12 + const MachineOperand &MO = MI.getOperand(Op); + const MachineOperand &MO1 = MI.getOperand(Op + 1); + if (!MO.isReg()) { + emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry); + return 0; + } + unsigned Reg = getARMRegisterNumbering(MO.getReg()); + int32_t Imm12 = MO1.getImm(); + + // Special value for #-0 + if (Imm12 == INT32_MIN) + Imm12 = 0; + + // Immediate is always encoded as positive. The 'U' bit controls add vs + // sub. + bool isAdd = true; + if (Imm12 < 0) { + Imm12 = -Imm12; + isAdd = false; + } + + uint32_t Binary = Imm12 & 0xfff; + if (isAdd) + Binary |= (1 << 12); + Binary |= (Reg << 13); + return Binary; + } + unsigned getNEONVcvtImm32OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + + unsigned getRegisterListOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + /// getMovi32Value - Return binary encoding of operand for movw/movt. If the /// machine operand requires relocation, record the relocation and return /// zero. unsigned getMovi32Value(const MachineInstr &MI,const MachineOperand &MO, unsigned Reloc); - unsigned getMovi32Value(const MachineInstr &MI, unsigned OpIdx, - unsigned Reloc) { - return getMovi32Value(MI, MI.getOperand(OpIdx), Reloc); - } /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value. /// @@ -173,12 +326,12 @@ namespace { /// fixed up by the relocation stage. void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, bool MayNeedFarStub, bool Indirect, - intptr_t ACPV = 0); - void emitExternalSymbolAddress(const char *ES, unsigned Reloc); - void emitConstPoolAddress(unsigned CPI, unsigned Reloc); - void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc); + intptr_t ACPV = 0) const; + void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const; + void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const; + void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const; void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc, - intptr_t JTBase = 0); + intptr_t JTBase = 0) const; }; } @@ -266,9 +419,9 @@ unsigned ARMCodeEmitter::getMovi32Value(const MachineInstr &MI, /// getMachineOpValue - Return binary encoding of operand. If the machine /// operand requires relocation, record the relocation and return zero. unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI, - const MachineOperand &MO) { + const MachineOperand &MO) const { if (MO.isReg()) - return ARMRegisterInfo::getRegisterNumbering(MO.getReg()); + return getARMRegisterNumbering(MO.getReg()); else if (MO.isImm()) return static_cast<unsigned>(MO.getImm()); else if (MO.isGlobal()) @@ -285,12 +438,8 @@ unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI, emitJumpTableAddress(MO.getIndex(), ARM::reloc_arm_relative); else if (MO.isMBB()) emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch); - else { -#ifndef NDEBUG - errs() << MO; -#endif - llvm_unreachable(0); - } + else + llvm_unreachable("Unable to encode MachineOperand!"); return 0; } @@ -298,7 +447,7 @@ unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI, /// void ARMCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, bool MayNeedFarStub, bool Indirect, - intptr_t ACPV) { + intptr_t ACPV) const { MachineRelocation MR = Indirect ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc, const_cast<GlobalValue *>(GV), @@ -312,7 +461,8 @@ void ARMCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, /// emitExternalSymbolAddress - Arrange for the address of an external symbol to /// be emitted to the current location in the function, and allow it to be PC /// relative. -void ARMCodeEmitter::emitExternalSymbolAddress(const char *ES, unsigned Reloc) { +void ARMCodeEmitter:: +emitExternalSymbolAddress(const char *ES, unsigned Reloc) const { MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), Reloc, ES)); } @@ -320,7 +470,7 @@ void ARMCodeEmitter::emitExternalSymbolAddress(const char *ES, unsigned Reloc) { /// emitConstPoolAddress - Arrange for the address of an constant pool /// to be emitted to the current location in the function, and allow it to be PC /// relative. -void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) { +void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const { // Tell JIT emitter we'll resolve the address. MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(), Reloc, CPI, 0, true)); @@ -329,14 +479,16 @@ void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) { /// emitJumpTableAddress - Arrange for the address of a jump table to /// be emitted to the current location in the function, and allow it to be PC /// relative. -void ARMCodeEmitter::emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) { +void ARMCodeEmitter:: +emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const { MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(), Reloc, JTIndex, 0, true)); } /// emitMachineBasicBlock - Emit the specified address basic block. void ARMCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB, - unsigned Reloc, intptr_t JTBase) { + unsigned Reloc, + intptr_t JTBase) const { MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(), Reloc, BB, JTBase)); } @@ -364,6 +516,14 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { llvm_unreachable("Unhandled instruction encoding format!"); break; } + case ARMII::MiscFrm: + if (MI.getOpcode() == ARM::LEApcrelJT) { + // Materialize jumptable address. + emitLEApcrelJTInstruction(MI); + break; + } + llvm_unreachable("Unhandled instruction encoding!"); + break; case ARMII::Pseudo: emitPseudoInstruction(MI); break; @@ -418,9 +578,7 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { case ARMII::VFPLdStMulFrm: emitVFPLoadStoreMultipleInstruction(MI); break; - case ARMII::VFPMiscFrm: - emitMiscInstruction(MI); - break; + // NEON instructions. case ARMII::NGetLnFrm: case ARMII::NSetLnFrm: @@ -488,7 +646,7 @@ void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) { emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false); emitWordLE(0); } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) { - uint32_t Val = *(uint32_t*)CI->getValue().getRawData(); + uint32_t Val = uint32_t(*CI->getValue().getRawData()); emitWordLE(Val); } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) { if (CFP->getType()->isFloatTy()) @@ -588,7 +746,7 @@ void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) { const TargetInstrDesc &TID = MI.getDesc(); // Emit the 'add' instruction. - unsigned Binary = 0x4 << 21; // add: Insts{24-31} = 0b0100 + unsigned Binary = 0x4 << 21; // add: Insts{24-21} = 0b0100 // Set the conditional execution predicate Binary |= II->getPredicate(&MI) << ARMII::CondShift; @@ -600,7 +758,7 @@ void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) { Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift; // Encode Rn which is PC. - Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift; + Binary |= getARMRegisterNumbering(ARM::PC) << ARMII::RegRnShift; // Encode the displacement. Binary |= 1 << ARMII::I_BitShift; @@ -628,7 +786,7 @@ void ARMCodeEmitter::emitPseudoMoveInstruction(const MachineInstr &MI) { // Encode the shift operation. switch (Opcode) { default: break; - case ARM::MOVrx: + case ARM::RRX: // rrx Binary |= 0x6 << 4; break; @@ -659,10 +817,10 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) { switch (Opcode) { default: llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction"); - case ARM::BX: - case ARM::BMOVPCRX: - case ARM::BXr9: - case ARM::BMOVPCRXr9: { + case ARM::BX_CALL: + case ARM::BMOVPCRX_CALL: + case ARM::BXr9_CALL: + case ARM::BMOVPCRXr9_CALL: { // First emit mov lr, pc unsigned Binary = 0x01a0e00f; Binary |= II->getPredicate(&MI) << ARMII::CondShift; @@ -720,18 +878,18 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) { } case ARM::MOVi32imm: - emitMOVi32immInstruction(MI); - break; - - case ARM::MOVi2pieces: // Two instructions to materialize a constant. - emitMOVi2piecesInstruction(MI); + if (Subtarget->hasV6T2Ops()) + emitMOVi32immInstruction(MI); + else + emitMOVi2piecesInstruction(MI); break; + case ARM::LEApcrelJT: // Materialize jumptable address. emitLEApcrelJTInstruction(MI); break; - case ARM::MOVrx: + case ARM::RRX: case ARM::MOVsrl_flag: case ARM::MOVsra_flag: emitPseudoMoveInstruction(MI); @@ -789,8 +947,7 @@ unsigned ARMCodeEmitter::getMachineSoRegOpValue(const MachineInstr &MI, if (Rs) { // Encode Rs bit[11:8]. assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0); - return Binary | - (ARMRegisterInfo::getRegisterNumbering(Rs) << ARMII::RegRsShift); + return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift); } // Encode shift_imm bit[11:7]. @@ -841,8 +998,7 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI, Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; else if (ImplicitRd) // Special handling for implicit use (e.g. PC). - Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRd) - << ARMII::RegRdShift); + Binary |= (getARMRegisterNumbering(ImplicitRd) << ARMII::RegRdShift); if (TID.Opcode == ARM::MOVi16) { // Get immediate from MI. @@ -892,8 +1048,7 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI, if (!isUnary) { if (ImplicitRn) // Special handling for implicit use (e.g. PC). - Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn) - << ARMII::RegRnShift); + Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift); else { Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRnShift; ++OpIdx; @@ -910,7 +1065,7 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI, if (MO.isReg()) { // Encode register Rm. - emitWordLE(Binary | ARMRegisterInfo::getRegisterNumbering(MO.getReg())); + emitWordLE(Binary | getARMRegisterNumbering(MO.getReg())); return; } @@ -930,6 +1085,13 @@ void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI, // Part of binary is determined by TableGn. unsigned Binary = getBinaryCodeForInstr(MI); + // If this is an LDRi12, STRi12 or LDRcp, nothing more needs be done. + if (MI.getOpcode() == ARM::LDRi12 || MI.getOpcode() == ARM::LDRcp || + MI.getOpcode() == ARM::STRi12) { + emitWordLE(Binary); + return; + } + // Set the conditional execution predicate Binary |= II->getPredicate(&MI) << ARMII::CondShift; @@ -946,16 +1108,14 @@ void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI, // Set first operand if (ImplicitRd) // Special handling for implicit use (e.g. PC). - Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRd) - << ARMII::RegRdShift); + Binary |= (getARMRegisterNumbering(ImplicitRd) << ARMII::RegRdShift); else Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; // Set second operand if (ImplicitRn) // Special handling for implicit use (e.g. PC). - Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn) - << ARMII::RegRnShift); + Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift); else Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; @@ -978,11 +1138,11 @@ void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI, return; } - // Set bit I(25), because this is not in immediate enconding. + // Set bit I(25), because this is not in immediate encoding. Binary |= 1 << ARMII::I_BitShift; assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg())); // Set bit[3:0] to the corresponding Rm register - Binary |= ARMRegisterInfo::getRegisterNumbering(MO2.getReg()); + Binary |= getARMRegisterNumbering(MO2.getReg()); // If this instr is in scaled register offset/index instruction, set // shift_immed(bit[11:7]) and shift(bit[6:5]) fields. @@ -1026,8 +1186,7 @@ void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI, // Set second operand if (ImplicitRn) // Special handling for implicit use (e.g. PC). - Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn) - << ARMII::RegRnShift); + Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift); else Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; @@ -1046,7 +1205,7 @@ void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI, // If this instr is in register offset/index encoding, set bit[3:0] // to the corresponding Rm register. if (MO2.getReg()) { - Binary |= ARMRegisterInfo::getRegisterNumbering(MO2.getReg()); + Binary |= getARMRegisterNumbering(MO2.getReg()); emitWordLE(Binary); return; } @@ -1100,8 +1259,8 @@ void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) { Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; // Set addressing mode by modifying bits U(23) and P(24) - const MachineOperand &MO = MI.getOperand(OpIdx++); - Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(MO.getImm())); + ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode()); + Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode)); // Set bit W(21) if (IsUpdating) @@ -1112,7 +1271,7 @@ void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || MO.isImplicit()) break; - unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(MO.getReg()); + unsigned RegNum = getARMRegisterNumbering(MO.getReg()); assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && RegNum < 16); Binary |= 0x1 << RegNum; @@ -1349,7 +1508,7 @@ void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) { if (TID.Opcode == ARM::BX_RET || TID.Opcode == ARM::MOVPCLR) // The return register is LR. - Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::LR); + Binary |= getARMRegisterNumbering(ARM::LR); else // otherwise, set the return register Binary |= getMachineOpValue(MI, 0); @@ -1360,8 +1519,8 @@ void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) { static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) { unsigned RegD = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - bool isSPVFP = false; - RegD = ARMRegisterInfo::getRegisterNumbering(RegD, &isSPVFP); + bool isSPVFP = ARM::SPRRegisterClass->contains(RegD); + RegD = getARMRegisterNumbering(RegD); if (!isSPVFP) Binary |= RegD << ARMII::RegRdShift; else { @@ -1374,8 +1533,8 @@ static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) { static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) { unsigned RegN = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - bool isSPVFP = false; - RegN = ARMRegisterInfo::getRegisterNumbering(RegN, &isSPVFP); + bool isSPVFP = ARM::SPRRegisterClass->contains(RegN); + RegN = getARMRegisterNumbering(RegN); if (!isSPVFP) Binary |= RegN << ARMII::RegRnShift; else { @@ -1388,8 +1547,8 @@ static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) { static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) { unsigned RegM = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - bool isSPVFP = false; - RegM = ARMRegisterInfo::getRegisterNumbering(RegM, &isSPVFP); + bool isSPVFP = ARM::SPRRegisterClass->contains(RegM); + RegM = getARMRegisterNumbering(RegM); if (!isSPVFP) Binary |= RegM; else { @@ -1548,8 +1707,8 @@ ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) { Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; // Set addressing mode by modifying bits U(23) and P(24) - const MachineOperand &MO = MI.getOperand(OpIdx++); - Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(MO.getImm())); + ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode()); + Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode)); // Set bit W(21) if (IsUpdating) @@ -1576,63 +1735,10 @@ ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) { emitWordLE(Binary); } -void ARMCodeEmitter::emitMiscInstruction(const MachineInstr &MI) { - unsigned Opcode = MI.getDesc().Opcode; - // Part of binary is determined by TableGn. - unsigned Binary = getBinaryCodeForInstr(MI); - - // Set the conditional execution predicate - Binary |= II->getPredicate(&MI) << ARMII::CondShift; - - switch(Opcode) { - default: - llvm_unreachable("ARMCodeEmitter::emitMiscInstruction"); - - case ARM::FMSTAT: - // No further encoding needed. - break; - - case ARM::VMRS: - case ARM::VMSR: { - const MachineOperand &MO0 = MI.getOperand(0); - // Encode Rt. - Binary |= ARMRegisterInfo::getRegisterNumbering(MO0.getReg()) - << ARMII::RegRdShift; - break; - } - - case ARM::FCONSTD: - case ARM::FCONSTS: { - // Encode Dd / Sd. - Binary |= encodeVFPRd(MI, 0); - - // Encode imm., Table A7-18 VFP modified immediate constants - const MachineOperand &MO1 = MI.getOperand(1); - unsigned Imm = static_cast<unsigned>(MO1.getFPImm()->getValueAPF() - .bitcastToAPInt().getHiBits(32).getLimitedValue()); - unsigned ModifiedImm; - - if(Opcode == ARM::FCONSTS) - ModifiedImm = (Imm & 0x80000000) >> 24 | // a - (Imm & 0x03F80000) >> 19; // bcdefgh - else // Opcode == ARM::FCONSTD - ModifiedImm = (Imm & 0x80000000) >> 24 | // a - (Imm & 0x007F0000) >> 16; // bcdefgh - - // Insts{19-16} = abcd, Insts{3-0} = efgh - Binary |= ((ModifiedImm & 0xF0) >> 4) << 16; - Binary |= (ModifiedImm & 0xF); - break; - } - } - - emitWordLE(Binary); -} - static unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) { unsigned RegD = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - RegD = ARMRegisterInfo::getRegisterNumbering(RegD); + RegD = getARMRegisterNumbering(RegD); Binary |= (RegD & 0xf) << ARMII::RegRdShift; Binary |= ((RegD >> 4) & 1) << ARMII::D_BitShift; return Binary; @@ -1641,7 +1747,7 @@ static unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) { static unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) { unsigned RegN = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - RegN = ARMRegisterInfo::getRegisterNumbering(RegN); + RegN = getARMRegisterNumbering(RegN); Binary |= (RegN & 0xf) << ARMII::RegRnShift; Binary |= ((RegN >> 4) & 1) << ARMII::N_BitShift; return Binary; @@ -1650,7 +1756,7 @@ static unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) { static unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) { unsigned RegM = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - RegM = ARMRegisterInfo::getRegisterNumbering(RegM); + RegM = getARMRegisterNumbering(RegM); Binary |= (RegM & 0xf); Binary |= ((RegM >> 4) & 1) << ARMII::M_BitShift; return Binary; @@ -1684,7 +1790,7 @@ void ARMCodeEmitter::emitNEONLaneInstruction(const MachineInstr &MI) { Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift; unsigned RegT = MI.getOperand(RegTOpIdx).getReg(); - RegT = ARMRegisterInfo::getRegisterNumbering(RegT); + RegT = getARMRegisterNumbering(RegT); Binary |= (RegT << ARMII::RegRdShift); Binary |= encodeNEONRn(MI, RegNOpIdx); @@ -1713,7 +1819,7 @@ void ARMCodeEmitter::emitNEONDupInstruction(const MachineInstr &MI) { Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift; unsigned RegT = MI.getOperand(1).getReg(); - RegT = ARMRegisterInfo::getRegisterNumbering(RegT); + RegT = getARMRegisterNumbering(RegT); Binary |= (RegT << ARMII::RegRdShift); Binary |= encodeNEONRn(MI, 0); emitWordLE(Binary); diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 60e923b..13d1b33 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -1,4 +1,4 @@ -//===-- ARMConstantIslandPass.cpp - ARM constant islands --------*- C++ -*-===// +//===-- ARMConstantIslandPass.cpp - ARM constant islands ------------------===// // // The LLVM Compiler Infrastructure // @@ -316,7 +316,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { } /// The next UID to take is the first unused one. - AFI->initConstPoolEntryUId(CPEMIs.size()); + AFI->initPICLabelUId(CPEMIs.size()); // Do the initial scan of the function, building up information about the // sizes of each block, the location of all the water, and finding all of the @@ -327,7 +327,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { /// Remove dead constant pool entries. - RemoveUnusedCPEntries(); + MadeChange |= RemoveUnusedCPEntries(); // Iteratively place constant pool entries and fix up branches until there // is no change. @@ -368,6 +368,14 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) MadeChange |= UndoLRSpillRestore(); + // Save the mapping between original and cloned constpool entries. + for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { + for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) { + const CPEntry & CPE = CPEntries[i][j]; + AFI->recordCPEClone(i, CPE.CPI); + } + } + DEBUG(errs() << '\n'; dumpBBs()); BBSizes.clear(); @@ -482,7 +490,7 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, HasInlineAsm = true; } - // Now go back through the instructions and build up our data structures + // Now go back through the instructions and build up our data structures. unsigned Offset = 0; for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); MBBI != E; ++MBBI) { @@ -603,7 +611,7 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, Scale = 4; break; - case ARM::LDR: + case ARM::LDRi12: case ARM::LDRcp: case ARM::t2LDRpci: Bits = 12; // +-offset_12 @@ -611,7 +619,6 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, break; case ARM::tLDRpci: - case ARM::tLDRcp: Bits = 8; Scale = 4; // +(offset_8*4) break; @@ -692,7 +699,7 @@ static bool CompareMBBNumbers(const MachineBasicBlock *LHS, /// machine function, it upsets all of the block numbers. Renumber the blocks /// and update the arrays that parallel this numbering. void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) { - // Renumber the MBB's to keep them consequtive. + // Renumber the MBB's to keep them consecutive. NewBB->getParent()->RenumberBlocks(NewBB); // Insert a size into BBSizes to align it properly with the (newly @@ -1242,7 +1249,7 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, // No existing clone of this CPE is within range. // We will be generating a new clone. Get a UID for it. - unsigned ID = AFI->createConstPoolEntryUId(); + unsigned ID = AFI->createPICLabelUId(); // Look for water where we can place this CPE. MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock(); @@ -1644,7 +1651,7 @@ bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { unsigned DestOffset = BBOffsets[DestBB->getNumber()]; if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) { MachineBasicBlock::iterator CmpMI = Br.MI; --CmpMI; - if (CmpMI->getOpcode() == ARM::tCMPzi8) { + if (CmpMI->getOpcode() == ARM::tCMPi8) { unsigned Reg = CmpMI->getOperand(0).getReg(); Pred = llvm::getInstrPredicate(CmpMI, PredReg); if (Pred == ARMCC::AL && @@ -1766,7 +1773,7 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { if (!OptOk) continue; - unsigned Opc = ByteOk ? ARM::t2TBB : ARM::t2TBH; + unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT; MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc)) .addReg(IdxReg, getKillRegState(IdxRegKill)) .addJumpTableIndex(JTI, JTOP.getTargetFlags()) diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp index f13ccc6..165a1d8 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -24,7 +24,7 @@ using namespace llvm; ARMConstantPoolValue::ARMConstantPoolValue(const Constant *cval, unsigned id, ARMCP::ARMCPKind K, unsigned char PCAdj, - const char *Modif, + ARMCP::ARMCPModifier Modif, bool AddCA) : MachineConstantPoolValue((const Type*)cval->getType()), CVal(cval), S(NULL), LabelId(id), Kind(K), PCAdjust(PCAdj), @@ -33,17 +33,17 @@ ARMConstantPoolValue::ARMConstantPoolValue(const Constant *cval, unsigned id, ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, const char *s, unsigned id, unsigned char PCAdj, - const char *Modif, + ARMCP::ARMCPModifier Modif, bool AddCA) : MachineConstantPoolValue((const Type*)Type::getInt32Ty(C)), CVal(NULL), S(strdup(s)), LabelId(id), Kind(ARMCP::CPExtSymbol), PCAdjust(PCAdj), Modifier(Modif), AddCurrentAddress(AddCA) {} ARMConstantPoolValue::ARMConstantPoolValue(const GlobalValue *gv, - const char *Modif) + ARMCP::ARMCPModifier Modif) : MachineConstantPoolValue((const Type*)Type::getInt32Ty(gv->getContext())), CVal(gv), S(NULL), LabelId(0), Kind(ARMCP::CPValue), PCAdjust(0), - Modifier(Modif) {} + Modifier(Modif), AddCurrentAddress(false) {} const GlobalValue *ARMConstantPoolValue::getGV() const { return dyn_cast_or_null<GlobalValue>(CVal); @@ -53,6 +53,14 @@ const BlockAddress *ARMConstantPoolValue::getBlockAddress() const { return dyn_cast_or_null<BlockAddress>(CVal); } +static bool CPV_streq(const char *S1, const char *S2) { + if (S1 == S2) + return true; + if (S1 && S2 && strcmp(S1, S2) == 0) + return true; + return false; +} + int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) { unsigned AlignMask = Alignment - 1; @@ -65,8 +73,8 @@ int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, if (CPV->CVal == CVal && CPV->LabelId == LabelId && CPV->PCAdjust == PCAdjust && - (CPV->S == S || strcmp(CPV->S, S) == 0) && - (CPV->Modifier == Modifier || strcmp(CPV->Modifier, Modifier) == 0)) + CPV_streq(CPV->S, S) && + CPV->Modifier == Modifier) return i; } } @@ -91,8 +99,8 @@ ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) { if (ACPV->Kind == Kind && ACPV->CVal == CVal && ACPV->PCAdjust == PCAdjust && - (ACPV->S == S || strcmp(ACPV->S, S) == 0) && - (ACPV->Modifier == Modifier || strcmp(ACPV->Modifier, Modifier) == 0)) { + CPV_streq(ACPV->S, S) && + ACPV->Modifier == Modifier) { if (ACPV->LabelId == LabelId) return true; // Two PC relative constpool entries containing the same GV address or @@ -113,7 +121,7 @@ void ARMConstantPoolValue::print(raw_ostream &O) const { O << CVal->getName(); else O << S; - if (Modifier) O << "(" << Modifier << ")"; + if (Modifier) O << "(" << getModifierText() << ")"; if (PCAdjust != 0) { O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust; if (AddCurrentAddress) O << "-."; diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h index 3119b54..d008811 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -15,6 +15,7 @@ #define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H #include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/Support/ErrorHandling.h" #include <cstddef> namespace llvm { @@ -31,6 +32,15 @@ namespace ARMCP { CPBlockAddress, CPLSDA }; + + enum ARMCPModifier { + no_modifier, + TLSGD, + GOT, + GOTOFF, + GOTTPOFF, + TPOFF + }; } /// ARMConstantPoolValue - ARM specific constantpool value. This is used to @@ -43,26 +53,41 @@ class ARMConstantPoolValue : public MachineConstantPoolValue { ARMCP::ARMCPKind Kind; // Kind of constant. unsigned char PCAdjust; // Extra adjustment if constantpool is pc-relative. // 8 for ARM, 4 for Thumb. - const char *Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8)) + ARMCP::ARMCPModifier Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8)) bool AddCurrentAddress; public: ARMConstantPoolValue(const Constant *cval, unsigned id, ARMCP::ARMCPKind Kind = ARMCP::CPValue, - unsigned char PCAdj = 0, const char *Modifier = NULL, + unsigned char PCAdj = 0, + ARMCP::ARMCPModifier Modifier = ARMCP::no_modifier, bool AddCurrentAddress = false); ARMConstantPoolValue(LLVMContext &C, const char *s, unsigned id, - unsigned char PCAdj = 0, const char *Modifier = NULL, + unsigned char PCAdj = 0, + ARMCP::ARMCPModifier Modifier = ARMCP::no_modifier, bool AddCurrentAddress = false); - ARMConstantPoolValue(const GlobalValue *GV, const char *Modifier); + ARMConstantPoolValue(const GlobalValue *GV, ARMCP::ARMCPModifier Modifier); ARMConstantPoolValue(); ~ARMConstantPoolValue(); const GlobalValue *getGV() const; const char *getSymbol() const { return S; } const BlockAddress *getBlockAddress() const; - const char *getModifier() const { return Modifier; } - bool hasModifier() const { return Modifier != NULL; } + ARMCP::ARMCPModifier getModifier() const { return Modifier; } + const char *getModifierText() const { + switch (Modifier) { + default: llvm_unreachable("Unknown modifier!"); + // FIXME: Are these case sensitive? It'd be nice to lower-case all the + // strings if that's legal. + case ARMCP::no_modifier: return "none"; + case ARMCP::TLSGD: return "tlsgd"; + case ARMCP::GOT: return "GOT"; + case ARMCP::GOTOFF: return "GOTOFF"; + case ARMCP::GOTTPOFF: return "gottpoff"; + case ARMCP::TPOFF: return "tpoff"; + } + } + bool hasModifier() const { return Modifier != ARMCP::no_modifier; } bool mustAddCurrentAddress() const { return AddCurrentAddress; } unsigned getLabelId() const { return LabelId; } unsigned char getPCAdjustment() const { return PCAdjust; } @@ -71,11 +96,7 @@ public: bool isBlockAddress() { return Kind == ARMCP::CPBlockAddress; } bool isLSDA() { return Kind == ARMCP::CPLSDA; } - virtual unsigned getRelocationInfo() const { - // FIXME: This is conservatively claiming that these entries require a - // relocation, we may be able to do better than this. - return 2; - } + virtual unsigned getRelocationInfo() const { return 2; } virtual int getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment); diff --git a/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.cpp new file mode 100644 index 0000000..51e68b4 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.cpp @@ -0,0 +1,83 @@ +//===-- ARMELFWriterInfo.cpp - ELF Writer Info for the ARM backend --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF writer information for the ARM backend. +// +//===----------------------------------------------------------------------===// + +#include "ARMELFWriterInfo.h" +#include "ARMRelocations.h" +#include "llvm/Function.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Implementation of the ARMELFWriterInfo class +//===----------------------------------------------------------------------===// + +ARMELFWriterInfo::ARMELFWriterInfo(TargetMachine &TM) + : TargetELFWriterInfo(TM.getTargetData()->getPointerSizeInBits() == 64, + TM.getTargetData()->isLittleEndian()) { +} + +ARMELFWriterInfo::~ARMELFWriterInfo() {} + +unsigned ARMELFWriterInfo::getRelocationType(unsigned MachineRelTy) const { + switch (MachineRelTy) { + case ARM::reloc_arm_absolute: + case ARM::reloc_arm_relative: + case ARM::reloc_arm_cp_entry: + case ARM::reloc_arm_vfp_cp_entry: + case ARM::reloc_arm_machine_cp_entry: + case ARM::reloc_arm_jt_base: + case ARM::reloc_arm_pic_jt: + assert(0 && "unsupported ARM relocation type"); break; + + case ARM::reloc_arm_branch: return ELF::R_ARM_CALL; break; + case ARM::reloc_arm_movt: return ELF::R_ARM_MOVT_ABS; break; + case ARM::reloc_arm_movw: return ELF::R_ARM_MOVW_ABS_NC; break; + default: + llvm_unreachable("unknown ARM relocation type"); break; + } + return 0; +} + +long int ARMELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy, + long int Modifier) const { + assert(0 && "ARMELFWriterInfo::getDefaultAddendForRelTy() not implemented"); + return 0; +} + +unsigned ARMELFWriterInfo::getRelocationTySize(unsigned RelTy) const { + assert(0 && "ARMELFWriterInfo::getRelocationTySize() not implemented"); + return 0; +} + +bool ARMELFWriterInfo::isPCRelativeRel(unsigned RelTy) const { + assert(0 && "ARMELFWriterInfo::isPCRelativeRel() not implemented"); + return 1; +} + +unsigned ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() const { + assert(0 && + "ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() not implemented"); + return 0; +} + +long int ARMELFWriterInfo::computeRelocation(unsigned SymOffset, + unsigned RelOffset, + unsigned RelTy) const { + assert(0 && + "ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() not implemented"); + return 0; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.h b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.h new file mode 100644 index 0000000..1c4e532 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.h @@ -0,0 +1,58 @@ +//===-- ARMELFWriterInfo.h - ELF Writer Info for ARM ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF writer information for the ARM backend. +// +//===----------------------------------------------------------------------===// + +#ifndef ARM_ELF_WRITER_INFO_H +#define ARM_ELF_WRITER_INFO_H + +#include "llvm/Target/TargetELFWriterInfo.h" + +namespace llvm { + + class ARMELFWriterInfo : public TargetELFWriterInfo { + public: + ARMELFWriterInfo(TargetMachine &TM); + virtual ~ARMELFWriterInfo(); + + /// getRelocationType - Returns the target specific ELF Relocation type. + /// 'MachineRelTy' contains the object code independent relocation type + virtual unsigned getRelocationType(unsigned MachineRelTy) const; + + /// hasRelocationAddend - True if the target uses an addend in the + /// ELF relocation entry. + virtual bool hasRelocationAddend() const { return false; } + + /// getDefaultAddendForRelTy - Gets the default addend value for a + /// relocation entry based on the target ELF relocation type. + virtual long int getDefaultAddendForRelTy(unsigned RelTy, + long int Modifier = 0) const; + + /// getRelTySize - Returns the size of relocatable field in bits + virtual unsigned getRelocationTySize(unsigned RelTy) const; + + /// isPCRelativeRel - True if the relocation type is pc relative + virtual bool isPCRelativeRel(unsigned RelTy) const; + + /// getJumpTableRelocationTy - Returns the machine relocation type used + /// to reference a jumptable. + virtual unsigned getAbsoluteLabelMachineRelTy() const; + + /// computeRelocation - Some relocatable fields could be relocated + /// directly, avoiding the relocation symbol emission, compute the + /// final relocation value for this symbol. + virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset, + unsigned RelTy) const; + }; + +} // end llvm namespace + +#endif // ARM_ELF_WRITER_INFO_H diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index fc2e3c3..bd753d2 100644 --- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -7,36 +7,38 @@ // //===----------------------------------------------------------------------===// // -// This file contains a pass that expand pseudo instructions into target +// This file contains a pass that expands pseudo instructions into target // instructions to allow proper scheduling, if-conversion, and other late // optimizations. This pass should be run after register allocation but before -// post- regalloc scheduling pass. +// the post-regalloc scheduling pass. // //===----------------------------------------------------------------------===// #define DEBUG_TYPE "arm-pseudo" #include "ARM.h" +#include "ARMAddressingModes.h" #include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove! using namespace llvm; namespace { class ARMExpandPseudo : public MachineFunctionPass { - // Constants for register spacing in NEON load/store instructions. - enum NEONRegSpacing { - SingleSpc, - EvenDblSpc, - OddDblSpc - }; - public: static char ID; ARMExpandPseudo() : MachineFunctionPass(ID) {} - const TargetInstrInfo *TII; + const ARMBaseInstrInfo *TII; const TargetRegisterInfo *TRI; + const ARMSubtarget *STI; + ARMFunctionInfo *AFI; virtual bool runOnMachineFunction(MachineFunction &Fn); @@ -47,11 +49,16 @@ namespace { private: void TransferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI); + bool ExpandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); bool ExpandMBB(MachineBasicBlock &MBB); - void ExpandVLD(MachineBasicBlock::iterator &MBBI, unsigned Opc, - bool hasWriteBack, NEONRegSpacing RegSpc, unsigned NumRegs); - void ExpandVST(MachineBasicBlock::iterator &MBBI, unsigned Opc, - bool hasWriteBack, NEONRegSpacing RegSpc, unsigned NumRegs); + void ExpandVLD(MachineBasicBlock::iterator &MBBI); + void ExpandVST(MachineBasicBlock::iterator &MBBI); + void ExpandLaneOp(MachineBasicBlock::iterator &MBBI); + void ExpandVTBL(MachineBasicBlock::iterator &MBBI, + unsigned Opc, bool IsExt, unsigned NumRegs); + void ExpandMOV32BitImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI); }; char ARMExpandPseudo::ID = 0; } @@ -67,44 +74,349 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI, const MachineOperand &MO = OldMI.getOperand(i); assert(MO.isReg() && MO.getReg()); if (MO.isUse()) - UseMI.addReg(MO.getReg(), getKillRegState(MO.isKill())); + UseMI.addOperand(MO); else - DefMI.addReg(MO.getReg(), - getDefRegState(true) | getDeadRegState(MO.isDead())); + DefMI.addOperand(MO); + } +} + +namespace { + // Constants for register spacing in NEON load/store instructions. + // For quad-register load-lane and store-lane pseudo instructors, the + // spacing is initially assumed to be EvenDblSpc, and that is changed to + // OddDblSpc depending on the lane number operand. + enum NEONRegSpacing { + SingleSpc, + EvenDblSpc, + OddDblSpc + }; + + // Entries for NEON load/store information table. The table is sorted by + // PseudoOpc for fast binary-search lookups. + struct NEONLdStTableEntry { + unsigned PseudoOpc; + unsigned RealOpc; + bool IsLoad; + bool HasWriteBack; + NEONRegSpacing RegSpacing; + unsigned char NumRegs; // D registers loaded or stored + unsigned char RegElts; // elements per D register; used for lane ops + + // Comparison methods for binary search of the table. + bool operator<(const NEONLdStTableEntry &TE) const { + return PseudoOpc < TE.PseudoOpc; + } + friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) { + return TE.PseudoOpc < PseudoOpc; + } + friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc, + const NEONLdStTableEntry &TE) { + return PseudoOpc < TE.PseudoOpc; + } + }; +} + +static const NEONLdStTableEntry NEONLdStTable[] = { +{ ARM::VLD1DUPq16Pseudo, ARM::VLD1DUPq16, true, false, SingleSpc, 2, 4}, +{ ARM::VLD1DUPq16Pseudo_UPD, ARM::VLD1DUPq16_UPD, true, true, SingleSpc, 2, 4}, +{ ARM::VLD1DUPq32Pseudo, ARM::VLD1DUPq32, true, false, SingleSpc, 2, 2}, +{ ARM::VLD1DUPq32Pseudo_UPD, ARM::VLD1DUPq32_UPD, true, true, SingleSpc, 2, 2}, +{ ARM::VLD1DUPq8Pseudo, ARM::VLD1DUPq8, true, false, SingleSpc, 2, 8}, +{ ARM::VLD1DUPq8Pseudo_UPD, ARM::VLD1DUPq8_UPD, true, true, SingleSpc, 2, 8}, + +{ ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, EvenDblSpc, 1, 4 }, +{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, EvenDblSpc, 1, 4 }, +{ ARM::VLD1LNq32Pseudo, ARM::VLD1LNd32, true, false, EvenDblSpc, 1, 2 }, +{ ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, true, EvenDblSpc, 1, 2 }, +{ ARM::VLD1LNq8Pseudo, ARM::VLD1LNd8, true, false, EvenDblSpc, 1, 8 }, +{ ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, EvenDblSpc, 1, 8 }, + +{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, SingleSpc, 4, 1 }, +{ ARM::VLD1d64QPseudo_UPD, ARM::VLD1d64Q_UPD, true, true, SingleSpc, 4, 1 }, +{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, SingleSpc, 3, 1 }, +{ ARM::VLD1d64TPseudo_UPD, ARM::VLD1d64T_UPD, true, true, SingleSpc, 3, 1 }, + +{ ARM::VLD1q16Pseudo, ARM::VLD1q16, true, false, SingleSpc, 2, 4 }, +{ ARM::VLD1q16Pseudo_UPD, ARM::VLD1q16_UPD, true, true, SingleSpc, 2, 4 }, +{ ARM::VLD1q32Pseudo, ARM::VLD1q32, true, false, SingleSpc, 2, 2 }, +{ ARM::VLD1q32Pseudo_UPD, ARM::VLD1q32_UPD, true, true, SingleSpc, 2, 2 }, +{ ARM::VLD1q64Pseudo, ARM::VLD1q64, true, false, SingleSpc, 2, 1 }, +{ ARM::VLD1q64Pseudo_UPD, ARM::VLD1q64_UPD, true, true, SingleSpc, 2, 1 }, +{ ARM::VLD1q8Pseudo, ARM::VLD1q8, true, false, SingleSpc, 2, 8 }, +{ ARM::VLD1q8Pseudo_UPD, ARM::VLD1q8_UPD, true, true, SingleSpc, 2, 8 }, + +{ ARM::VLD2DUPd16Pseudo, ARM::VLD2DUPd16, true, false, SingleSpc, 2, 4}, +{ ARM::VLD2DUPd16Pseudo_UPD, ARM::VLD2DUPd16_UPD, true, true, SingleSpc, 2, 4}, +{ ARM::VLD2DUPd32Pseudo, ARM::VLD2DUPd32, true, false, SingleSpc, 2, 2}, +{ ARM::VLD2DUPd32Pseudo_UPD, ARM::VLD2DUPd32_UPD, true, true, SingleSpc, 2, 2}, +{ ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd8, true, false, SingleSpc, 2, 8}, +{ ARM::VLD2DUPd8Pseudo_UPD, ARM::VLD2DUPd8_UPD, true, true, SingleSpc, 2, 8}, + +{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, SingleSpc, 2, 4 }, +{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, SingleSpc, 2, 4 }, +{ ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, SingleSpc, 2, 2 }, +{ ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true, SingleSpc, 2, 2 }, +{ ARM::VLD2LNd8Pseudo, ARM::VLD2LNd8, true, false, SingleSpc, 2, 8 }, +{ ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd8_UPD, true, true, SingleSpc, 2, 8 }, +{ ARM::VLD2LNq16Pseudo, ARM::VLD2LNq16, true, false, EvenDblSpc, 2, 4 }, +{ ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true, EvenDblSpc, 2, 4 }, +{ ARM::VLD2LNq32Pseudo, ARM::VLD2LNq32, true, false, EvenDblSpc, 2, 2 }, +{ ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true, EvenDblSpc, 2, 2 }, + +{ ARM::VLD2d16Pseudo, ARM::VLD2d16, true, false, SingleSpc, 2, 4 }, +{ ARM::VLD2d16Pseudo_UPD, ARM::VLD2d16_UPD, true, true, SingleSpc, 2, 4 }, +{ ARM::VLD2d32Pseudo, ARM::VLD2d32, true, false, SingleSpc, 2, 2 }, +{ ARM::VLD2d32Pseudo_UPD, ARM::VLD2d32_UPD, true, true, SingleSpc, 2, 2 }, +{ ARM::VLD2d8Pseudo, ARM::VLD2d8, true, false, SingleSpc, 2, 8 }, +{ ARM::VLD2d8Pseudo_UPD, ARM::VLD2d8_UPD, true, true, SingleSpc, 2, 8 }, + +{ ARM::VLD2q16Pseudo, ARM::VLD2q16, true, false, SingleSpc, 4, 4 }, +{ ARM::VLD2q16Pseudo_UPD, ARM::VLD2q16_UPD, true, true, SingleSpc, 4, 4 }, +{ ARM::VLD2q32Pseudo, ARM::VLD2q32, true, false, SingleSpc, 4, 2 }, +{ ARM::VLD2q32Pseudo_UPD, ARM::VLD2q32_UPD, true, true, SingleSpc, 4, 2 }, +{ ARM::VLD2q8Pseudo, ARM::VLD2q8, true, false, SingleSpc, 4, 8 }, +{ ARM::VLD2q8Pseudo_UPD, ARM::VLD2q8_UPD, true, true, SingleSpc, 4, 8 }, + +{ ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd16, true, false, SingleSpc, 3, 4}, +{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, SingleSpc, 3, 4}, +{ ARM::VLD3DUPd32Pseudo, ARM::VLD3DUPd32, true, false, SingleSpc, 3, 2}, +{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, SingleSpc, 3, 2}, +{ ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, SingleSpc, 3, 8}, +{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, SingleSpc, 3, 8}, + +{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, SingleSpc, 3, 4 }, +{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, SingleSpc, 3, 4 }, +{ ARM::VLD3LNd32Pseudo, ARM::VLD3LNd32, true, false, SingleSpc, 3, 2 }, +{ ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true, SingleSpc, 3, 2 }, +{ ARM::VLD3LNd8Pseudo, ARM::VLD3LNd8, true, false, SingleSpc, 3, 8 }, +{ ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd8_UPD, true, true, SingleSpc, 3, 8 }, +{ ARM::VLD3LNq16Pseudo, ARM::VLD3LNq16, true, false, EvenDblSpc, 3, 4 }, +{ ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true, EvenDblSpc, 3, 4 }, +{ ARM::VLD3LNq32Pseudo, ARM::VLD3LNq32, true, false, EvenDblSpc, 3, 2 }, +{ ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true, EvenDblSpc, 3, 2 }, + +{ ARM::VLD3d16Pseudo, ARM::VLD3d16, true, false, SingleSpc, 3, 4 }, +{ ARM::VLD3d16Pseudo_UPD, ARM::VLD3d16_UPD, true, true, SingleSpc, 3, 4 }, +{ ARM::VLD3d32Pseudo, ARM::VLD3d32, true, false, SingleSpc, 3, 2 }, +{ ARM::VLD3d32Pseudo_UPD, ARM::VLD3d32_UPD, true, true, SingleSpc, 3, 2 }, +{ ARM::VLD3d8Pseudo, ARM::VLD3d8, true, false, SingleSpc, 3, 8 }, +{ ARM::VLD3d8Pseudo_UPD, ARM::VLD3d8_UPD, true, true, SingleSpc, 3, 8 }, + +{ ARM::VLD3q16Pseudo_UPD, ARM::VLD3q16_UPD, true, true, EvenDblSpc, 3, 4 }, +{ ARM::VLD3q16oddPseudo, ARM::VLD3q16, true, false, OddDblSpc, 3, 4 }, +{ ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true, true, OddDblSpc, 3, 4 }, +{ ARM::VLD3q32Pseudo_UPD, ARM::VLD3q32_UPD, true, true, EvenDblSpc, 3, 2 }, +{ ARM::VLD3q32oddPseudo, ARM::VLD3q32, true, false, OddDblSpc, 3, 2 }, +{ ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true, true, OddDblSpc, 3, 2 }, +{ ARM::VLD3q8Pseudo_UPD, ARM::VLD3q8_UPD, true, true, EvenDblSpc, 3, 8 }, +{ ARM::VLD3q8oddPseudo, ARM::VLD3q8, true, false, OddDblSpc, 3, 8 }, +{ ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q8_UPD, true, true, OddDblSpc, 3, 8 }, + +{ ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd16, true, false, SingleSpc, 4, 4}, +{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true, SingleSpc, 4, 4}, +{ ARM::VLD4DUPd32Pseudo, ARM::VLD4DUPd32, true, false, SingleSpc, 4, 2}, +{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, SingleSpc, 4, 2}, +{ ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, SingleSpc, 4, 8}, +{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, SingleSpc, 4, 8}, + +{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, SingleSpc, 4, 4 }, +{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, SingleSpc, 4, 4 }, +{ ARM::VLD4LNd32Pseudo, ARM::VLD4LNd32, true, false, SingleSpc, 4, 2 }, +{ ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true, SingleSpc, 4, 2 }, +{ ARM::VLD4LNd8Pseudo, ARM::VLD4LNd8, true, false, SingleSpc, 4, 8 }, +{ ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd8_UPD, true, true, SingleSpc, 4, 8 }, +{ ARM::VLD4LNq16Pseudo, ARM::VLD4LNq16, true, false, EvenDblSpc, 4, 4 }, +{ ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true, EvenDblSpc, 4, 4 }, +{ ARM::VLD4LNq32Pseudo, ARM::VLD4LNq32, true, false, EvenDblSpc, 4, 2 }, +{ ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, true, EvenDblSpc, 4, 2 }, + +{ ARM::VLD4d16Pseudo, ARM::VLD4d16, true, false, SingleSpc, 4, 4 }, +{ ARM::VLD4d16Pseudo_UPD, ARM::VLD4d16_UPD, true, true, SingleSpc, 4, 4 }, +{ ARM::VLD4d32Pseudo, ARM::VLD4d32, true, false, SingleSpc, 4, 2 }, +{ ARM::VLD4d32Pseudo_UPD, ARM::VLD4d32_UPD, true, true, SingleSpc, 4, 2 }, +{ ARM::VLD4d8Pseudo, ARM::VLD4d8, true, false, SingleSpc, 4, 8 }, +{ ARM::VLD4d8Pseudo_UPD, ARM::VLD4d8_UPD, true, true, SingleSpc, 4, 8 }, + +{ ARM::VLD4q16Pseudo_UPD, ARM::VLD4q16_UPD, true, true, EvenDblSpc, 4, 4 }, +{ ARM::VLD4q16oddPseudo, ARM::VLD4q16, true, false, OddDblSpc, 4, 4 }, +{ ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true, true, OddDblSpc, 4, 4 }, +{ ARM::VLD4q32Pseudo_UPD, ARM::VLD4q32_UPD, true, true, EvenDblSpc, 4, 2 }, +{ ARM::VLD4q32oddPseudo, ARM::VLD4q32, true, false, OddDblSpc, 4, 2 }, +{ ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true, true, OddDblSpc, 4, 2 }, +{ ARM::VLD4q8Pseudo_UPD, ARM::VLD4q8_UPD, true, true, EvenDblSpc, 4, 8 }, +{ ARM::VLD4q8oddPseudo, ARM::VLD4q8, true, false, OddDblSpc, 4, 8 }, +{ ARM::VLD4q8oddPseudo_UPD, ARM::VLD4q8_UPD, true, true, OddDblSpc, 4, 8 }, + +{ ARM::VST1LNq16Pseudo, ARM::VST1LNd16, false, false, EvenDblSpc, 1, 4 }, +{ ARM::VST1LNq16Pseudo_UPD, ARM::VST1LNd16_UPD,false, true, EvenDblSpc, 1, 4 }, +{ ARM::VST1LNq32Pseudo, ARM::VST1LNd32, false, false, EvenDblSpc, 1, 2 }, +{ ARM::VST1LNq32Pseudo_UPD, ARM::VST1LNd32_UPD,false, true, EvenDblSpc, 1, 2 }, +{ ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, EvenDblSpc, 1, 8 }, +{ ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, EvenDblSpc, 1, 8 }, + +{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, SingleSpc, 4, 1 }, +{ ARM::VST1d64QPseudo_UPD, ARM::VST1d64Q_UPD, false, true, SingleSpc, 4, 1 }, +{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, SingleSpc, 3, 1 }, +{ ARM::VST1d64TPseudo_UPD, ARM::VST1d64T_UPD, false, true, SingleSpc, 3, 1 }, + +{ ARM::VST1q16Pseudo, ARM::VST1q16, false, false, SingleSpc, 2, 4 }, +{ ARM::VST1q16Pseudo_UPD, ARM::VST1q16_UPD, false, true, SingleSpc, 2, 4 }, +{ ARM::VST1q32Pseudo, ARM::VST1q32, false, false, SingleSpc, 2, 2 }, +{ ARM::VST1q32Pseudo_UPD, ARM::VST1q32_UPD, false, true, SingleSpc, 2, 2 }, +{ ARM::VST1q64Pseudo, ARM::VST1q64, false, false, SingleSpc, 2, 1 }, +{ ARM::VST1q64Pseudo_UPD, ARM::VST1q64_UPD, false, true, SingleSpc, 2, 1 }, +{ ARM::VST1q8Pseudo, ARM::VST1q8, false, false, SingleSpc, 2, 8 }, +{ ARM::VST1q8Pseudo_UPD, ARM::VST1q8_UPD, false, true, SingleSpc, 2, 8 }, + +{ ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, SingleSpc, 2, 4 }, +{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, SingleSpc, 2, 4 }, +{ ARM::VST2LNd32Pseudo, ARM::VST2LNd32, false, false, SingleSpc, 2, 2 }, +{ ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true, SingleSpc, 2, 2 }, +{ ARM::VST2LNd8Pseudo, ARM::VST2LNd8, false, false, SingleSpc, 2, 8 }, +{ ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd8_UPD, false, true, SingleSpc, 2, 8 }, +{ ARM::VST2LNq16Pseudo, ARM::VST2LNq16, false, false, EvenDblSpc, 2, 4}, +{ ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true, EvenDblSpc, 2, 4}, +{ ARM::VST2LNq32Pseudo, ARM::VST2LNq32, false, false, EvenDblSpc, 2, 2}, +{ ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, EvenDblSpc, 2, 2}, + +{ ARM::VST2d16Pseudo, ARM::VST2d16, false, false, SingleSpc, 2, 4 }, +{ ARM::VST2d16Pseudo_UPD, ARM::VST2d16_UPD, false, true, SingleSpc, 2, 4 }, +{ ARM::VST2d32Pseudo, ARM::VST2d32, false, false, SingleSpc, 2, 2 }, +{ ARM::VST2d32Pseudo_UPD, ARM::VST2d32_UPD, false, true, SingleSpc, 2, 2 }, +{ ARM::VST2d8Pseudo, ARM::VST2d8, false, false, SingleSpc, 2, 8 }, +{ ARM::VST2d8Pseudo_UPD, ARM::VST2d8_UPD, false, true, SingleSpc, 2, 8 }, + +{ ARM::VST2q16Pseudo, ARM::VST2q16, false, false, SingleSpc, 4, 4 }, +{ ARM::VST2q16Pseudo_UPD, ARM::VST2q16_UPD, false, true, SingleSpc, 4, 4 }, +{ ARM::VST2q32Pseudo, ARM::VST2q32, false, false, SingleSpc, 4, 2 }, +{ ARM::VST2q32Pseudo_UPD, ARM::VST2q32_UPD, false, true, SingleSpc, 4, 2 }, +{ ARM::VST2q8Pseudo, ARM::VST2q8, false, false, SingleSpc, 4, 8 }, +{ ARM::VST2q8Pseudo_UPD, ARM::VST2q8_UPD, false, true, SingleSpc, 4, 8 }, + +{ ARM::VST3LNd16Pseudo, ARM::VST3LNd16, false, false, SingleSpc, 3, 4 }, +{ ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, SingleSpc, 3, 4 }, +{ ARM::VST3LNd32Pseudo, ARM::VST3LNd32, false, false, SingleSpc, 3, 2 }, +{ ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true, SingleSpc, 3, 2 }, +{ ARM::VST3LNd8Pseudo, ARM::VST3LNd8, false, false, SingleSpc, 3, 8 }, +{ ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd8_UPD, false, true, SingleSpc, 3, 8 }, +{ ARM::VST3LNq16Pseudo, ARM::VST3LNq16, false, false, EvenDblSpc, 3, 4}, +{ ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true, EvenDblSpc, 3, 4}, +{ ARM::VST3LNq32Pseudo, ARM::VST3LNq32, false, false, EvenDblSpc, 3, 2}, +{ ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true, EvenDblSpc, 3, 2}, + +{ ARM::VST3d16Pseudo, ARM::VST3d16, false, false, SingleSpc, 3, 4 }, +{ ARM::VST3d16Pseudo_UPD, ARM::VST3d16_UPD, false, true, SingleSpc, 3, 4 }, +{ ARM::VST3d32Pseudo, ARM::VST3d32, false, false, SingleSpc, 3, 2 }, +{ ARM::VST3d32Pseudo_UPD, ARM::VST3d32_UPD, false, true, SingleSpc, 3, 2 }, +{ ARM::VST3d8Pseudo, ARM::VST3d8, false, false, SingleSpc, 3, 8 }, +{ ARM::VST3d8Pseudo_UPD, ARM::VST3d8_UPD, false, true, SingleSpc, 3, 8 }, + +{ ARM::VST3q16Pseudo_UPD, ARM::VST3q16_UPD, false, true, EvenDblSpc, 3, 4 }, +{ ARM::VST3q16oddPseudo, ARM::VST3q16, false, false, OddDblSpc, 3, 4 }, +{ ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true, OddDblSpc, 3, 4 }, +{ ARM::VST3q32Pseudo_UPD, ARM::VST3q32_UPD, false, true, EvenDblSpc, 3, 2 }, +{ ARM::VST3q32oddPseudo, ARM::VST3q32, false, false, OddDblSpc, 3, 2 }, +{ ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true, OddDblSpc, 3, 2 }, +{ ARM::VST3q8Pseudo_UPD, ARM::VST3q8_UPD, false, true, EvenDblSpc, 3, 8 }, +{ ARM::VST3q8oddPseudo, ARM::VST3q8, false, false, OddDblSpc, 3, 8 }, +{ ARM::VST3q8oddPseudo_UPD, ARM::VST3q8_UPD, false, true, OddDblSpc, 3, 8 }, + +{ ARM::VST4LNd16Pseudo, ARM::VST4LNd16, false, false, SingleSpc, 4, 4 }, +{ ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true, SingleSpc, 4, 4 }, +{ ARM::VST4LNd32Pseudo, ARM::VST4LNd32, false, false, SingleSpc, 4, 2 }, +{ ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true, SingleSpc, 4, 2 }, +{ ARM::VST4LNd8Pseudo, ARM::VST4LNd8, false, false, SingleSpc, 4, 8 }, +{ ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd8_UPD, false, true, SingleSpc, 4, 8 }, +{ ARM::VST4LNq16Pseudo, ARM::VST4LNq16, false, false, EvenDblSpc, 4, 4}, +{ ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true, EvenDblSpc, 4, 4}, +{ ARM::VST4LNq32Pseudo, ARM::VST4LNq32, false, false, EvenDblSpc, 4, 2}, +{ ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true, EvenDblSpc, 4, 2}, + +{ ARM::VST4d16Pseudo, ARM::VST4d16, false, false, SingleSpc, 4, 4 }, +{ ARM::VST4d16Pseudo_UPD, ARM::VST4d16_UPD, false, true, SingleSpc, 4, 4 }, +{ ARM::VST4d32Pseudo, ARM::VST4d32, false, false, SingleSpc, 4, 2 }, +{ ARM::VST4d32Pseudo_UPD, ARM::VST4d32_UPD, false, true, SingleSpc, 4, 2 }, +{ ARM::VST4d8Pseudo, ARM::VST4d8, false, false, SingleSpc, 4, 8 }, +{ ARM::VST4d8Pseudo_UPD, ARM::VST4d8_UPD, false, true, SingleSpc, 4, 8 }, + +{ ARM::VST4q16Pseudo_UPD, ARM::VST4q16_UPD, false, true, EvenDblSpc, 4, 4 }, +{ ARM::VST4q16oddPseudo, ARM::VST4q16, false, false, OddDblSpc, 4, 4 }, +{ ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true, OddDblSpc, 4, 4 }, +{ ARM::VST4q32Pseudo_UPD, ARM::VST4q32_UPD, false, true, EvenDblSpc, 4, 2 }, +{ ARM::VST4q32oddPseudo, ARM::VST4q32, false, false, OddDblSpc, 4, 2 }, +{ ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, true, OddDblSpc, 4, 2 }, +{ ARM::VST4q8Pseudo_UPD, ARM::VST4q8_UPD, false, true, EvenDblSpc, 4, 8 }, +{ ARM::VST4q8oddPseudo, ARM::VST4q8, false, false, OddDblSpc, 4, 8 }, +{ ARM::VST4q8oddPseudo_UPD, ARM::VST4q8_UPD, false, true, OddDblSpc, 4, 8 } +}; + +/// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON +/// load or store pseudo instruction. +static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { + unsigned NumEntries = array_lengthof(NEONLdStTable); + +#ifndef NDEBUG + // Make sure the table is sorted. + static bool TableChecked = false; + if (!TableChecked) { + for (unsigned i = 0; i != NumEntries-1; ++i) + assert(NEONLdStTable[i] < NEONLdStTable[i+1] && + "NEONLdStTable is not sorted!"); + TableChecked = true; + } +#endif + + const NEONLdStTableEntry *I = + std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode); + if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode) + return I; + return NULL; +} + +/// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register, +/// corresponding to the specified register spacing. Not all of the results +/// are necessarily valid, e.g., a Q register only has 2 D subregisters. +static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc, + const TargetRegisterInfo *TRI, unsigned &D0, + unsigned &D1, unsigned &D2, unsigned &D3) { + if (RegSpc == SingleSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_0); + D1 = TRI->getSubReg(Reg, ARM::dsub_1); + D2 = TRI->getSubReg(Reg, ARM::dsub_2); + D3 = TRI->getSubReg(Reg, ARM::dsub_3); + } else if (RegSpc == EvenDblSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_0); + D1 = TRI->getSubReg(Reg, ARM::dsub_2); + D2 = TRI->getSubReg(Reg, ARM::dsub_4); + D3 = TRI->getSubReg(Reg, ARM::dsub_6); + } else { + assert(RegSpc == OddDblSpc && "unknown register spacing"); + D0 = TRI->getSubReg(Reg, ARM::dsub_1); + D1 = TRI->getSubReg(Reg, ARM::dsub_3); + D2 = TRI->getSubReg(Reg, ARM::dsub_5); + D3 = TRI->getSubReg(Reg, ARM::dsub_7); } } /// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register /// operands to real VLD instructions with D register operands. -void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI, - unsigned Opc, bool hasWriteBack, - NEONRegSpacing RegSpc, unsigned NumRegs) { +void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; MachineBasicBlock &MBB = *MI.getParent(); - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)); + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); unsigned OpIdx = 0; bool DstIsDead = MI.getOperand(OpIdx).isDead(); unsigned DstReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; - if (RegSpc == SingleSpc) { - D0 = TRI->getSubReg(DstReg, ARM::dsub_0); - D1 = TRI->getSubReg(DstReg, ARM::dsub_1); - D2 = TRI->getSubReg(DstReg, ARM::dsub_2); - D3 = TRI->getSubReg(DstReg, ARM::dsub_3); - } else if (RegSpc == EvenDblSpc) { - D0 = TRI->getSubReg(DstReg, ARM::dsub_0); - D1 = TRI->getSubReg(DstReg, ARM::dsub_2); - D2 = TRI->getSubReg(DstReg, ARM::dsub_4); - D3 = TRI->getSubReg(DstReg, ARM::dsub_6); - } else { - assert(RegSpc == OddDblSpc && "unknown register spacing for VLD"); - D0 = TRI->getSubReg(DstReg, ARM::dsub_1); - D1 = TRI->getSubReg(DstReg, ARM::dsub_3); - D2 = TRI->getSubReg(DstReg, ARM::dsub_5); - D3 = TRI->getSubReg(DstReg, ARM::dsub_7); - } + GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 2) @@ -112,107 +424,373 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI, if (NumRegs > 3) MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); - if (hasWriteBack) { - bool WBIsDead = MI.getOperand(OpIdx).isDead(); - unsigned WBReg = MI.getOperand(OpIdx++).getReg(); - MIB.addReg(WBReg, RegState::Define | getDeadRegState(WBIsDead)); - } + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the addrmode6 operands. - bool AddrIsKill = MI.getOperand(OpIdx).isKill(); - MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(AddrIsKill)); - MIB.addImm(MI.getOperand(OpIdx++).getImm()); - if (hasWriteBack) { - // Copy the am6offset operand. - bool OffsetIsKill = MI.getOperand(OpIdx).isKill(); - MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(OffsetIsKill)); - } + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); - MIB = AddDefaultPred(MIB); - TransferImpOps(MI, MIB, MIB); - // For an instruction writing the odd subregs, add an implicit use of the - // super-register because the even subregs were loaded separately. - if (RegSpc == OddDblSpc) - MIB.addReg(DstReg, RegState::Implicit); + // For an instruction writing double-spaced subregs, the pseudo instruction + // has an extra operand that is a use of the super-register. Record the + // operand index and skip over it. + unsigned SrcOpIdx = 0; + if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc) + SrcOpIdx = OpIdx++; + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the super-register source operand used for double-spaced subregs over + // to the new instruction as an implicit operand. + if (SrcOpIdx != 0) { + MachineOperand MO = MI.getOperand(SrcOpIdx); + MO.setImplicit(true); + MIB.addOperand(MO); + } // Add an implicit def for the super-register. MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); + TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); } /// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register /// operands to real VST instructions with D register operands. -void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI, - unsigned Opc, bool hasWriteBack, - NEONRegSpacing RegSpc, unsigned NumRegs) { +void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; MachineBasicBlock &MBB = *MI.getParent(); - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)); + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); unsigned OpIdx = 0; - if (hasWriteBack) { - bool DstIsDead = MI.getOperand(OpIdx).isDead(); - unsigned DstReg = MI.getOperand(OpIdx++).getReg(); - MIB.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); - } + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the addrmode6 operands. - bool AddrIsKill = MI.getOperand(OpIdx).isKill(); - MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(AddrIsKill)); - MIB.addImm(MI.getOperand(OpIdx++).getImm()); - if (hasWriteBack) { - // Copy the am6offset operand. - bool OffsetIsKill = MI.getOperand(OpIdx).isKill(); - MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(OffsetIsKill)); - } + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); bool SrcIsKill = MI.getOperand(OpIdx).isKill(); - unsigned SrcReg = MI.getOperand(OpIdx).getReg(); + unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; - if (RegSpc == SingleSpc) { - D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); - D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); - D2 = TRI->getSubReg(SrcReg, ARM::dsub_2); - D3 = TRI->getSubReg(SrcReg, ARM::dsub_3); - } else if (RegSpc == EvenDblSpc) { - D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); - D1 = TRI->getSubReg(SrcReg, ARM::dsub_2); - D2 = TRI->getSubReg(SrcReg, ARM::dsub_4); - D3 = TRI->getSubReg(SrcReg, ARM::dsub_6); - } else { - assert(RegSpc == OddDblSpc && "unknown register spacing for VST"); - D0 = TRI->getSubReg(SrcReg, ARM::dsub_1); - D1 = TRI->getSubReg(SrcReg, ARM::dsub_3); - D2 = TRI->getSubReg(SrcReg, ARM::dsub_5); - D3 = TRI->getSubReg(SrcReg, ARM::dsub_7); - } - + GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0).addReg(D1); if (NumRegs > 2) MIB.addReg(D2); if (NumRegs > 3) MIB.addReg(D3); - MIB = AddDefaultPred(MIB); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + if (SrcIsKill) + // Add an implicit kill for the super-reg. + (*MIB).addRegisterKilled(SrcReg, TRI, true); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); +} + +/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ +/// register operands to real instructions with D register operands. +void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + unsigned RegElts = TableEntry->RegElts; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); + unsigned OpIdx = 0; + // The lane operand is always the 3rd from last operand, before the 2 + // predicate operands. + unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm(); + + // Adjust the lane and spacing as needed for Q registers. + assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane"); + if (RegSpc == EvenDblSpc && Lane >= RegElts) { + RegSpc = OddDblSpc; + Lane -= RegElts; + } + assert(Lane < RegElts && "out of range lane for VLD/VST-lane"); + + unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0; + unsigned DstReg = 0; + bool DstIsDead = false; + if (TableEntry->IsLoad) { + DstIsDead = MI.getOperand(OpIdx).isDead(); + DstReg = MI.getOperand(OpIdx++).getReg(); + GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 1) + MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 2) + MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 3) + MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); + } + + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the addrmode6 operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Grab the super-register source. + MachineOperand MO = MI.getOperand(OpIdx++); + if (!TableEntry->IsLoad) + GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3); + + // Add the subregs as sources of the new instruction. + unsigned SrcFlags = (getUndefRegState(MO.isUndef()) | + getKillRegState(MO.isKill())); + MIB.addReg(D0, SrcFlags); + if (NumRegs > 1) + MIB.addReg(D1, SrcFlags); + if (NumRegs > 2) + MIB.addReg(D2, SrcFlags); + if (NumRegs > 3) + MIB.addReg(D3, SrcFlags); + + // Add the lane number operand. + MIB.addImm(Lane); + OpIdx += 1; + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the super-register source to be an implicit source. + MO.setImplicit(true); + MIB.addOperand(MO); + if (TableEntry->IsLoad) + // Add an implicit def for the super-register. + MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); +} + +/// ExpandVTBL - Translate VTBL and VTBX pseudo instructions with Q or QQ +/// register operands to real instructions with D register operands. +void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, + unsigned Opc, bool IsExt, unsigned NumRegs) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)); + unsigned OpIdx = 0; + + // Transfer the destination register operand. + MIB.addOperand(MI.getOperand(OpIdx++)); + if (IsExt) + MIB.addOperand(MI.getOperand(OpIdx++)); + + bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + unsigned D0, D1, D2, D3; + GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0).addReg(D1); + if (NumRegs > 2) + MIB.addReg(D2); + if (NumRegs > 3) + MIB.addReg(D3); + + // Copy the other source register operand. + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + if (SrcIsKill) // Add an implicit kill for the super-reg. (*MIB).addRegisterKilled(SrcReg, TRI, true); + TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); } -bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { - bool Modified = false; +void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm; + const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1); + MachineInstrBuilder LO16, HI16; - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - while (MBBI != E) { - MachineInstr &MI = *MBBI; - MachineBasicBlock::iterator NMBBI = llvm::next(MBBI); + if (!STI->hasV6T2Ops() && + (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) { + // Expand into a movi + orr. + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + + assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!"); + unsigned ImmVal = (unsigned)MO.getImm(); + unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); + unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); + LO16 = LO16.addImm(SOImmValV1); + HI16 = HI16.addImm(SOImmValV2); + (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16.addImm(Pred).addReg(PredReg).addReg(0); + HI16.addImm(Pred).addReg(PredReg).addReg(0); + TransferImpOps(MI, LO16, HI16); + MI.eraseFromParent(); + return; + } + + unsigned LO16Opc = 0; + unsigned HI16Opc = 0; + if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) { + LO16Opc = ARM::t2MOVi16; + HI16Opc = ARM::t2MOVTi16; + } else { + LO16Opc = ARM::MOVi16; + HI16Opc = ARM::MOVTi16; + } - bool ModifiedOp = true; - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + + if (MO.isImm()) { + unsigned Imm = MO.getImm(); + unsigned Lo16 = Imm & 0xffff; + unsigned Hi16 = (Imm >> 16) & 0xffff; + LO16 = LO16.addImm(Lo16); + HI16 = HI16.addImm(Hi16); + } else { + const GlobalValue *GV = MO.getGlobal(); + unsigned TF = MO.getTargetFlags(); + LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16); + HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); + } + + (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16.addImm(Pred).addReg(PredReg); + HI16.addImm(Pred).addReg(PredReg); + + TransferImpOps(MI, LO16, HI16); + MI.eraseFromParent(); +} + +bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { default: - ModifiedOp = false; - break; + return false; + case ARM::Int_eh_sjlj_dispatchsetup: { + MachineFunction &MF = *MI.getParent()->getParent(); + const ARMBaseInstrInfo *AII = + static_cast<const ARMBaseInstrInfo*>(TII); + const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); + // For functions using a base pointer, we rematerialize it (via the frame + // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it + // for us. Otherwise, expand to nothing. + if (RI.hasBasePointer(MF)) { + int32_t NumBytes = AFI->getFramePtrSpillOffset(); + unsigned FramePtr = RI.getFrameRegister(MF); + assert(MF.getTarget().getFrameLowering()->hasFP(MF) && + "base pointer without frame pointer?"); + + if (AFI->isThumb2Function()) { + llvm::emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, ARMCC::AL, 0, *TII); + } else if (AFI->isThumbFunction()) { + llvm::emitThumbRegPlusImmediate(MBB, MBBI, ARM::R6, + FramePtr, -NumBytes, + *TII, RI, MI.getDebugLoc()); + } else { + llvm::emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, ARMCC::AL, 0, + *TII); + } + // If there's dynamic realignment, adjust for it. + if (RI.needsStackRealignment(MF)) { + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned MaxAlign = MFI->getMaxAlignment(); + assert (!AFI->isThumb1OnlyFunction()); + // Emit bic r6, r6, MaxAlign + unsigned bicOpc = AFI->isThumbFunction() ? + ARM::t2BICri : ARM::BICri; + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(bicOpc), ARM::R6) + .addReg(ARM::R6, RegState::Kill) + .addImm(MaxAlign-1))); + } + + } + MI.eraseFromParent(); + return true; + } - case ARM::tLDRpci_pic: + case ARM::MOVsrl_flag: + case ARM::MOVsra_flag: { + // These are just fancy MOVs insructions. + AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs), + MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)) + .addReg(0) + .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ? ARM_AM::lsr + : ARM_AM::asr), 1))) + .addReg(ARM::CPSR, RegState::Define); + MI.eraseFromParent(); + return true; + } + case ARM::RRX: { + // This encodes as "MOVs Rd, Rm, rrx + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs), + MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(1)) + .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0))) + .addReg(0); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + case ARM::TPsoft: { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::BL)) + .addExternalSymbol("__aeabi_read_tp", 0); + + (*MIB).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + case ARM::tLDRpci_pic: case ARM::t2LDRpci_pic: { unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic) ? ARM::tLDRpci : ARM::t2LDRpci; @@ -225,54 +803,73 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { (*MIB1).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD)) - .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg) .addOperand(MI.getOperand(2)); TransferImpOps(MI, MIB1, MIB2); MI.eraseFromParent(); - break; + return true; } - case ARM::MOVi32imm: - case ARM::t2MOVi32imm: { - unsigned PredReg = 0; - ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); + case ARM::MOV_ga_dyn: + case ARM::MOV_ga_pcrel: + case ARM::MOV_ga_pcrel_ldr: + case ARM::t2MOV_ga_dyn: + case ARM::t2MOV_ga_pcrel: { + // Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode. + unsigned LabelId = AFI->createPICLabelUId(); unsigned DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); - const MachineOperand &MO = MI.getOperand(1); - MachineInstrBuilder LO16, HI16; - - LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == ARM::MOVi32imm ? - ARM::MOVi16 : ARM::t2MOVi16), - DstReg); - HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == ARM::MOVi32imm ? - ARM::MOVTi16 : ARM::t2MOVTi16)) - .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(DstReg); - - if (MO.isImm()) { - unsigned Imm = MO.getImm(); - unsigned Lo16 = Imm & 0xffff; - unsigned Hi16 = (Imm >> 16) & 0xffff; - LO16 = LO16.addImm(Lo16); - HI16 = HI16.addImm(Hi16); - } else { - const GlobalValue *GV = MO.getGlobal(); - unsigned TF = MO.getTargetFlags(); - LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16); - HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); + const MachineOperand &MO1 = MI.getOperand(1); + const GlobalValue *GV = MO1.getGlobal(); + unsigned TF = MO1.getTargetFlags(); + bool isARM = Opcode != ARM::t2MOV_ga_pcrel; + bool isPIC = (Opcode != ARM::MOV_ga_dyn && Opcode != ARM::t2MOV_ga_dyn); + unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel; + unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel : ARM::t2MOVTi16_ga_pcrel; + unsigned LO16TF = isPIC + ? ARMII::MO_LO16_NONLAZY_PIC : ARMII::MO_LO16_NONLAZY; + unsigned HI16TF = isPIC + ? ARMII::MO_HI16_NONLAZY_PIC : ARMII::MO_HI16_NONLAZY; + unsigned PICAddOpc = isARM + ? (Opcode == ARM::MOV_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD) + : ARM::tPICADD; + MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(LO16Opc), DstReg) + .addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF) + .addImm(LabelId); + MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(HI16Opc), DstReg) + .addReg(DstReg) + .addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF) + .addImm(LabelId); + if (!isPIC) { + TransferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; } - (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - LO16.addImm(Pred).addReg(PredReg); - HI16.addImm(Pred).addReg(PredReg); - TransferImpOps(MI, LO16, HI16); + + MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(PICAddOpc)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg).addImm(LabelId); + if (isARM) { + AddDefaultPred(MIB3); + if (Opcode == ARM::MOV_ga_pcrel_ldr) + (*MIB2).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + } + TransferImpOps(MI, MIB1, MIB3); MI.eraseFromParent(); - break; + return true; } + case ARM::MOVi32imm: + case ARM::MOVCCi32imm: + case ARM::t2MOVi32imm: + case ARM::t2MOVCCi32imm: + ExpandMOV32BitImm(MBB, MBBI); + return true; + case ARM::VMOVQQ: { unsigned DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); @@ -285,222 +882,339 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { MachineInstrBuilder Even = AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::VMOVQ)) - .addReg(EvenDst, - getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(EvenSrc, getKillRegState(SrcIsKill))); + .addReg(EvenDst, + RegState::Define | getDeadRegState(DstIsDead)) + .addReg(EvenSrc, getKillRegState(SrcIsKill))); MachineInstrBuilder Odd = AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::VMOVQ)) - .addReg(OddDst, - getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(OddSrc, getKillRegState(SrcIsKill))); + .addReg(OddDst, + RegState::Define | getDeadRegState(DstIsDead)) + .addReg(OddSrc, getKillRegState(SrcIsKill))); TransferImpOps(MI, Even, Odd); MI.eraseFromParent(); + return true; + } + + case ARM::VLDMQIA: + case ARM::VLDMQDB: { + unsigned NewOpc = (Opcode == ARM::VLDMQIA) ? ARM::VLDMDIA : ARM::VLDMDDB; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); + unsigned OpIdx = 0; + + // Grab the Q register destination. + bool DstIsDead = MI.getOperand(OpIdx).isDead(); + unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + + // Copy the source register. + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Add the destination operands (D subregs). + unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0); + unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + + // Add an implicit def for the super-register. + MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + + case ARM::VSTMQIA: + case ARM::VSTMQDB: { + unsigned NewOpc = (Opcode == ARM::VSTMQIA) ? ARM::VSTMDIA : ARM::VSTMDDB; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); + unsigned OpIdx = 0; + + // Grab the Q register source. + bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + + // Copy the destination register. + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Add the source operands (D subregs). + unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); + unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); + MIB.addReg(D0).addReg(D1); + + if (SrcIsKill) + // Add an implicit kill for the Q register. + (*MIB).addRegisterKilled(SrcReg, TRI, true); + + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + case ARM::VDUPfqf: + case ARM::VDUPfdf:{ + unsigned NewOpc = Opcode == ARM::VDUPfqf ? ARM::VDUPLNfq : ARM::VDUPLNfd; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); + unsigned OpIdx = 0; + unsigned SrcReg = MI.getOperand(1).getReg(); + unsigned Lane = getARMRegisterNumbering(SrcReg) & 1; + unsigned DReg = TRI->getMatchingSuperReg(SrcReg, + Lane & 1 ? ARM::ssub_1 : ARM::ssub_0, &ARM::DPR_VFP2RegClass); + // The lane is [0,1] for the containing DReg superregister. + // Copy the dst/src register operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addReg(DReg); + ++OpIdx; + // Add the lane select operand. + MIB.addImm(Lane); + // Add the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; } case ARM::VLD1q8Pseudo: - ExpandVLD(MBBI, ARM::VLD1q8, false, SingleSpc, 2); break; case ARM::VLD1q16Pseudo: - ExpandVLD(MBBI, ARM::VLD1q16, false, SingleSpc, 2); break; case ARM::VLD1q32Pseudo: - ExpandVLD(MBBI, ARM::VLD1q32, false, SingleSpc, 2); break; case ARM::VLD1q64Pseudo: - ExpandVLD(MBBI, ARM::VLD1q64, false, SingleSpc, 2); break; case ARM::VLD1q8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1q8, true, SingleSpc, 2); break; case ARM::VLD1q16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1q16, true, SingleSpc, 2); break; case ARM::VLD1q32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1q32, true, SingleSpc, 2); break; case ARM::VLD1q64Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1q64, true, SingleSpc, 2); break; - case ARM::VLD2d8Pseudo: - ExpandVLD(MBBI, ARM::VLD2d8, false, SingleSpc, 2); break; case ARM::VLD2d16Pseudo: - ExpandVLD(MBBI, ARM::VLD2d16, false, SingleSpc, 2); break; case ARM::VLD2d32Pseudo: - ExpandVLD(MBBI, ARM::VLD2d32, false, SingleSpc, 2); break; case ARM::VLD2q8Pseudo: - ExpandVLD(MBBI, ARM::VLD2q8, false, SingleSpc, 4); break; case ARM::VLD2q16Pseudo: - ExpandVLD(MBBI, ARM::VLD2q16, false, SingleSpc, 4); break; case ARM::VLD2q32Pseudo: - ExpandVLD(MBBI, ARM::VLD2q32, false, SingleSpc, 4); break; case ARM::VLD2d8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2d8, true, SingleSpc, 2); break; case ARM::VLD2d16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2d16, true, SingleSpc, 2); break; case ARM::VLD2d32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2d32, true, SingleSpc, 2); break; case ARM::VLD2q8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2q8, true, SingleSpc, 4); break; case ARM::VLD2q16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2q16, true, SingleSpc, 4); break; case ARM::VLD2q32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2q32, true, SingleSpc, 4); break; - case ARM::VLD3d8Pseudo: - ExpandVLD(MBBI, ARM::VLD3d8, false, SingleSpc, 3); break; case ARM::VLD3d16Pseudo: - ExpandVLD(MBBI, ARM::VLD3d16, false, SingleSpc, 3); break; case ARM::VLD3d32Pseudo: - ExpandVLD(MBBI, ARM::VLD3d32, false, SingleSpc, 3); break; case ARM::VLD1d64TPseudo: - ExpandVLD(MBBI, ARM::VLD1d64T, false, SingleSpc, 3); break; case ARM::VLD3d8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3d8_UPD, true, SingleSpc, 3); break; case ARM::VLD3d16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3d16_UPD, true, SingleSpc, 3); break; case ARM::VLD3d32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3d32_UPD, true, SingleSpc, 3); break; case ARM::VLD1d64TPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1d64T_UPD, true, SingleSpc, 3); break; case ARM::VLD3q8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q8_UPD, true, EvenDblSpc, 3); break; case ARM::VLD3q16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q16_UPD, true, EvenDblSpc, 3); break; case ARM::VLD3q32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q32_UPD, true, EvenDblSpc, 3); break; + case ARM::VLD3q8oddPseudo: + case ARM::VLD3q16oddPseudo: + case ARM::VLD3q32oddPseudo: case ARM::VLD3q8oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q8_UPD, true, OddDblSpc, 3); break; case ARM::VLD3q16oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q16_UPD, true, OddDblSpc, 3); break; case ARM::VLD3q32oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q32_UPD, true, OddDblSpc, 3); break; - case ARM::VLD4d8Pseudo: - ExpandVLD(MBBI, ARM::VLD4d8, false, SingleSpc, 4); break; case ARM::VLD4d16Pseudo: - ExpandVLD(MBBI, ARM::VLD4d16, false, SingleSpc, 4); break; case ARM::VLD4d32Pseudo: - ExpandVLD(MBBI, ARM::VLD4d32, false, SingleSpc, 4); break; case ARM::VLD1d64QPseudo: - ExpandVLD(MBBI, ARM::VLD1d64Q, false, SingleSpc, 4); break; case ARM::VLD4d8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4d8_UPD, true, SingleSpc, 4); break; case ARM::VLD4d16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4d16_UPD, true, SingleSpc, 4); break; case ARM::VLD4d32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4d32_UPD, true, SingleSpc, 4); break; case ARM::VLD1d64QPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1d64Q_UPD, true, SingleSpc, 4); break; case ARM::VLD4q8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q8_UPD, true, EvenDblSpc, 4); break; case ARM::VLD4q16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q16_UPD, true, EvenDblSpc, 4); break; case ARM::VLD4q32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q32_UPD, true, EvenDblSpc, 4); break; + case ARM::VLD4q8oddPseudo: + case ARM::VLD4q16oddPseudo: + case ARM::VLD4q32oddPseudo: case ARM::VLD4q8oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q8_UPD, true, OddDblSpc, 4); break; case ARM::VLD4q16oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q16_UPD, true, OddDblSpc, 4); break; case ARM::VLD4q32oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q32_UPD, true, OddDblSpc, 4); break; + case ARM::VLD1DUPq8Pseudo: + case ARM::VLD1DUPq16Pseudo: + case ARM::VLD1DUPq32Pseudo: + case ARM::VLD1DUPq8Pseudo_UPD: + case ARM::VLD1DUPq16Pseudo_UPD: + case ARM::VLD1DUPq32Pseudo_UPD: + case ARM::VLD2DUPd8Pseudo: + case ARM::VLD2DUPd16Pseudo: + case ARM::VLD2DUPd32Pseudo: + case ARM::VLD2DUPd8Pseudo_UPD: + case ARM::VLD2DUPd16Pseudo_UPD: + case ARM::VLD2DUPd32Pseudo_UPD: + case ARM::VLD3DUPd8Pseudo: + case ARM::VLD3DUPd16Pseudo: + case ARM::VLD3DUPd32Pseudo: + case ARM::VLD3DUPd8Pseudo_UPD: + case ARM::VLD3DUPd16Pseudo_UPD: + case ARM::VLD3DUPd32Pseudo_UPD: + case ARM::VLD4DUPd8Pseudo: + case ARM::VLD4DUPd16Pseudo: + case ARM::VLD4DUPd32Pseudo: + case ARM::VLD4DUPd8Pseudo_UPD: + case ARM::VLD4DUPd16Pseudo_UPD: + case ARM::VLD4DUPd32Pseudo_UPD: + ExpandVLD(MBBI); + return true; case ARM::VST1q8Pseudo: - ExpandVST(MBBI, ARM::VST1q8, false, SingleSpc, 2); break; case ARM::VST1q16Pseudo: - ExpandVST(MBBI, ARM::VST1q16, false, SingleSpc, 2); break; case ARM::VST1q32Pseudo: - ExpandVST(MBBI, ARM::VST1q32, false, SingleSpc, 2); break; case ARM::VST1q64Pseudo: - ExpandVST(MBBI, ARM::VST1q64, false, SingleSpc, 2); break; case ARM::VST1q8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST1q8_UPD, true, SingleSpc, 2); break; case ARM::VST1q16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST1q16_UPD, true, SingleSpc, 2); break; case ARM::VST1q32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST1q32_UPD, true, SingleSpc, 2); break; case ARM::VST1q64Pseudo_UPD: - ExpandVST(MBBI, ARM::VST1q64_UPD, true, SingleSpc, 2); break; - case ARM::VST2d8Pseudo: - ExpandVST(MBBI, ARM::VST2d8, false, SingleSpc, 2); break; case ARM::VST2d16Pseudo: - ExpandVST(MBBI, ARM::VST2d16, false, SingleSpc, 2); break; case ARM::VST2d32Pseudo: - ExpandVST(MBBI, ARM::VST2d32, false, SingleSpc, 2); break; case ARM::VST2q8Pseudo: - ExpandVST(MBBI, ARM::VST2q8, false, SingleSpc, 4); break; case ARM::VST2q16Pseudo: - ExpandVST(MBBI, ARM::VST2q16, false, SingleSpc, 4); break; case ARM::VST2q32Pseudo: - ExpandVST(MBBI, ARM::VST2q32, false, SingleSpc, 4); break; case ARM::VST2d8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2d8_UPD, true, SingleSpc, 2); break; case ARM::VST2d16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2d16_UPD, true, SingleSpc, 2); break; case ARM::VST2d32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2d32_UPD, true, SingleSpc, 2); break; case ARM::VST2q8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2q8_UPD, true, SingleSpc, 4); break; case ARM::VST2q16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2q16_UPD, true, SingleSpc, 4); break; case ARM::VST2q32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2q32_UPD, true, SingleSpc, 4); break; - case ARM::VST3d8Pseudo: - ExpandVST(MBBI, ARM::VST3d8, false, SingleSpc, 3); break; case ARM::VST3d16Pseudo: - ExpandVST(MBBI, ARM::VST3d16, false, SingleSpc, 3); break; case ARM::VST3d32Pseudo: - ExpandVST(MBBI, ARM::VST3d32, false, SingleSpc, 3); break; case ARM::VST1d64TPseudo: - ExpandVST(MBBI, ARM::VST1d64T, false, SingleSpc, 3); break; case ARM::VST3d8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3d8_UPD, true, SingleSpc, 3); break; case ARM::VST3d16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3d16_UPD, true, SingleSpc, 3); break; case ARM::VST3d32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3d32_UPD, true, SingleSpc, 3); break; case ARM::VST1d64TPseudo_UPD: - ExpandVST(MBBI, ARM::VST1d64T_UPD, true, SingleSpc, 3); break; case ARM::VST3q8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3q8_UPD, true, EvenDblSpc, 3); break; case ARM::VST3q16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3q16_UPD, true, EvenDblSpc, 3); break; case ARM::VST3q32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3q32_UPD, true, EvenDblSpc, 3); break; + case ARM::VST3q8oddPseudo: + case ARM::VST3q16oddPseudo: + case ARM::VST3q32oddPseudo: case ARM::VST3q8oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST3q8_UPD, true, OddDblSpc, 3); break; case ARM::VST3q16oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST3q16_UPD, true, OddDblSpc, 3); break; case ARM::VST3q32oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST3q32_UPD, true, OddDblSpc, 3); break; - case ARM::VST4d8Pseudo: - ExpandVST(MBBI, ARM::VST4d8, false, SingleSpc, 4); break; case ARM::VST4d16Pseudo: - ExpandVST(MBBI, ARM::VST4d16, false, SingleSpc, 4); break; case ARM::VST4d32Pseudo: - ExpandVST(MBBI, ARM::VST4d32, false, SingleSpc, 4); break; case ARM::VST1d64QPseudo: - ExpandVST(MBBI, ARM::VST1d64Q, false, SingleSpc, 4); break; case ARM::VST4d8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4d8_UPD, true, SingleSpc, 4); break; case ARM::VST4d16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4d16_UPD, true, SingleSpc, 4); break; case ARM::VST4d32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4d32_UPD, true, SingleSpc, 4); break; case ARM::VST1d64QPseudo_UPD: - ExpandVST(MBBI, ARM::VST1d64Q_UPD, true, SingleSpc, 4); break; case ARM::VST4q8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4q8_UPD, true, EvenDblSpc, 4); break; case ARM::VST4q16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4q16_UPD, true, EvenDblSpc, 4); break; case ARM::VST4q32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4q32_UPD, true, EvenDblSpc, 4); break; + case ARM::VST4q8oddPseudo: + case ARM::VST4q16oddPseudo: + case ARM::VST4q32oddPseudo: case ARM::VST4q8oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST4q8_UPD, true, OddDblSpc, 4); break; case ARM::VST4q16oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST4q16_UPD, true, OddDblSpc, 4); break; case ARM::VST4q32oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST4q32_UPD, true, OddDblSpc, 4); break; - } + ExpandVST(MBBI); + return true; + + case ARM::VLD1LNq8Pseudo: + case ARM::VLD1LNq16Pseudo: + case ARM::VLD1LNq32Pseudo: + case ARM::VLD1LNq8Pseudo_UPD: + case ARM::VLD1LNq16Pseudo_UPD: + case ARM::VLD1LNq32Pseudo_UPD: + case ARM::VLD2LNd8Pseudo: + case ARM::VLD2LNd16Pseudo: + case ARM::VLD2LNd32Pseudo: + case ARM::VLD2LNq16Pseudo: + case ARM::VLD2LNq32Pseudo: + case ARM::VLD2LNd8Pseudo_UPD: + case ARM::VLD2LNd16Pseudo_UPD: + case ARM::VLD2LNd32Pseudo_UPD: + case ARM::VLD2LNq16Pseudo_UPD: + case ARM::VLD2LNq32Pseudo_UPD: + case ARM::VLD3LNd8Pseudo: + case ARM::VLD3LNd16Pseudo: + case ARM::VLD3LNd32Pseudo: + case ARM::VLD3LNq16Pseudo: + case ARM::VLD3LNq32Pseudo: + case ARM::VLD3LNd8Pseudo_UPD: + case ARM::VLD3LNd16Pseudo_UPD: + case ARM::VLD3LNd32Pseudo_UPD: + case ARM::VLD3LNq16Pseudo_UPD: + case ARM::VLD3LNq32Pseudo_UPD: + case ARM::VLD4LNd8Pseudo: + case ARM::VLD4LNd16Pseudo: + case ARM::VLD4LNd32Pseudo: + case ARM::VLD4LNq16Pseudo: + case ARM::VLD4LNq32Pseudo: + case ARM::VLD4LNd8Pseudo_UPD: + case ARM::VLD4LNd16Pseudo_UPD: + case ARM::VLD4LNd32Pseudo_UPD: + case ARM::VLD4LNq16Pseudo_UPD: + case ARM::VLD4LNq32Pseudo_UPD: + case ARM::VST1LNq8Pseudo: + case ARM::VST1LNq16Pseudo: + case ARM::VST1LNq32Pseudo: + case ARM::VST1LNq8Pseudo_UPD: + case ARM::VST1LNq16Pseudo_UPD: + case ARM::VST1LNq32Pseudo_UPD: + case ARM::VST2LNd8Pseudo: + case ARM::VST2LNd16Pseudo: + case ARM::VST2LNd32Pseudo: + case ARM::VST2LNq16Pseudo: + case ARM::VST2LNq32Pseudo: + case ARM::VST2LNd8Pseudo_UPD: + case ARM::VST2LNd16Pseudo_UPD: + case ARM::VST2LNd32Pseudo_UPD: + case ARM::VST2LNq16Pseudo_UPD: + case ARM::VST2LNq32Pseudo_UPD: + case ARM::VST3LNd8Pseudo: + case ARM::VST3LNd16Pseudo: + case ARM::VST3LNd32Pseudo: + case ARM::VST3LNq16Pseudo: + case ARM::VST3LNq32Pseudo: + case ARM::VST3LNd8Pseudo_UPD: + case ARM::VST3LNd16Pseudo_UPD: + case ARM::VST3LNd32Pseudo_UPD: + case ARM::VST3LNq16Pseudo_UPD: + case ARM::VST3LNq32Pseudo_UPD: + case ARM::VST4LNd8Pseudo: + case ARM::VST4LNd16Pseudo: + case ARM::VST4LNd32Pseudo: + case ARM::VST4LNq16Pseudo: + case ARM::VST4LNq32Pseudo: + case ARM::VST4LNd8Pseudo_UPD: + case ARM::VST4LNd16Pseudo_UPD: + case ARM::VST4LNd32Pseudo_UPD: + case ARM::VST4LNq16Pseudo_UPD: + case ARM::VST4LNq32Pseudo_UPD: + ExpandLaneOp(MBBI); + return true; + + case ARM::VTBL2Pseudo: ExpandVTBL(MBBI, ARM::VTBL2, false, 2); return true; + case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false, 3); return true; + case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false, 4); return true; + case ARM::VTBX2Pseudo: ExpandVTBL(MBBI, ARM::VTBX2, true, 2); return true; + case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true, 3); return true; + case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true, 4); return true; + } + + return false; +} - if (ModifiedOp) - Modified = true; +bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = llvm::next(MBBI); + Modified |= ExpandMI(MBB, MBBI); MBBI = NMBBI; } @@ -508,8 +1222,11 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { } bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); + const TargetMachine &TM = MF.getTarget(); + TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo()); + TRI = TM.getRegisterInfo(); + STI = &TM.getSubtarget<ARMSubtarget>(); + AFI = MF.getInfo<ARMFunctionInfo>(); bool Modified = false; for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp index 4892eae..9f29530 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -15,14 +15,17 @@ #include "ARM.h" #include "ARMBaseInstrInfo.h" +#include "ARMCallingConv.h" #include "ARMRegisterInfo.h" #include "ARMTargetMachine.h" #include "ARMSubtarget.h" +#include "ARMConstantPoolValue.h" #include "llvm/CallingConv.h" #include "llvm/DerivedTypes.h" #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -30,7 +33,9 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -43,12 +48,37 @@ using namespace llvm; static cl::opt<bool> -EnableARMFastISel("arm-fast-isel", - cl::desc("Turn on experimental ARM fast-isel support"), - cl::init(false), cl::Hidden); +DisableARMFastISel("disable-arm-fast-isel", + cl::desc("Turn off experimental ARM fast-isel support"), + cl::init(false), cl::Hidden); + +extern cl::opt<bool> EnableARMLongCalls; namespace { + // All possible address modes, plus some. + typedef struct Address { + enum { + RegBase, + FrameIndexBase + } BaseType; + + union { + unsigned Reg; + int FI; + } Base; + + int Offset; + unsigned Scale; + unsigned PlusReg; + + // Innocuous defaults for our address. + Address() + : BaseType(RegBase), Offset(0), Scale(0), PlusReg(0) { + Base.Reg = 0; + } + } Address; + class ARMFastISel : public FastISel { /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can @@ -57,13 +87,14 @@ class ARMFastISel : public FastISel { const TargetMachine &TM; const TargetInstrInfo &TII; const TargetLowering &TLI; - const ARMFunctionInfo *AFI; + ARMFunctionInfo *AFI; - // Convenience variable to avoid checking all the time. + // Convenience variables to avoid some queries. bool isThumb; + LLVMContext *Context; public: - explicit ARMFastISel(FunctionLoweringInfo &funcInfo) + explicit ARMFastISel(FunctionLoweringInfo &funcInfo) : FastISel(funcInfo), TM(funcInfo.MF->getTarget()), TII(*TM.getInstrInfo()), @@ -71,6 +102,7 @@ class ARMFastISel : public FastISel { Subtarget = &TM.getSubtarget<ARMSubtarget>(); AFI = funcInfo.MF->getInfo<ARMFunctionInfo>(); isThumb = AFI->isThumbFunction(); + Context = &funcInfo.Fn->getContext(); } // Code from FastISel.cpp. @@ -102,36 +134,73 @@ class ARMFastISel : public FastISel { virtual unsigned FastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx); - + // Backend specific FastISel code. virtual bool TargetSelectInstruction(const Instruction *I); virtual unsigned TargetMaterializeConstant(const Constant *C); + virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI); #include "ARMGenFastISel.inc" - + // Instruction selection routines. - virtual bool ARMSelectLoad(const Instruction *I); - virtual bool ARMSelectStore(const Instruction *I); - virtual bool ARMSelectBranch(const Instruction *I); + private: + bool SelectLoad(const Instruction *I); + bool SelectStore(const Instruction *I); + bool SelectBranch(const Instruction *I); + bool SelectCmp(const Instruction *I); + bool SelectFPExt(const Instruction *I); + bool SelectFPTrunc(const Instruction *I); + bool SelectBinaryOp(const Instruction *I, unsigned ISDOpcode); + bool SelectSIToFP(const Instruction *I); + bool SelectFPToSI(const Instruction *I); + bool SelectSDiv(const Instruction *I); + bool SelectSRem(const Instruction *I); + bool SelectCall(const Instruction *I); + bool SelectSelect(const Instruction *I); + bool SelectRet(const Instruction *I); // Utility routines. private: - bool isTypeLegal(const Type *Ty, EVT &VT); - bool isLoadTypeLegal(const Type *Ty, EVT &VT); - bool ARMEmitLoad(EVT VT, unsigned &ResultReg, unsigned Reg, int Offset); - bool ARMEmitStore(EVT VT, unsigned SrcReg, unsigned Reg, int Offset); - bool ARMLoadAlloca(const Instruction *I); - bool ARMStoreAlloca(const Instruction *I, unsigned SrcReg); - bool ARMComputeRegOffset(const Value *Obj, unsigned &Reg, int &Offset); - bool ARMMaterializeConstant(const ConstantInt *Val, unsigned &Reg); - + bool isTypeLegal(const Type *Ty, MVT &VT); + bool isLoadTypeLegal(const Type *Ty, MVT &VT); + bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr); + bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr); + bool ARMComputeAddress(const Value *Obj, Address &Addr); + void ARMSimplifyAddress(Address &Addr, EVT VT); + unsigned ARMMaterializeFP(const ConstantFP *CFP, EVT VT); + unsigned ARMMaterializeInt(const Constant *C, EVT VT); + unsigned ARMMaterializeGV(const GlobalValue *GV, EVT VT); + unsigned ARMMoveToFPReg(EVT VT, unsigned SrcReg); + unsigned ARMMoveToIntReg(EVT VT, unsigned SrcReg); + + // Call handling routines. + private: + bool FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, + unsigned &ResultReg); + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool Return); + bool ProcessCallArgs(SmallVectorImpl<Value*> &Args, + SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<MVT> &ArgVTs, + SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, + SmallVectorImpl<unsigned> &RegArgs, + CallingConv::ID CC, + unsigned &NumBytes); + bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + const Instruction *I, CallingConv::ID CC, + unsigned &NumBytes); + bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call); + + // OptionalDef handling routines. + private: bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR); const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB); + void AddLoadStoreOperands(EVT VT, Address &Addr, + const MachineInstrBuilder &MIB); }; } // end anonymous namespace -// #include "ARMGenCallingConv.inc" +#include "ARMGenCallingConv.inc" // DefinesOptionalPredicate - This is different from DefinesPredicate in that // we don't care about implicit defs here, just places we'll need to add a @@ -153,6 +222,9 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { // If the machine is predicable go ahead and add the predicate operands, if // it needs default CC operands add those. +// TODO: If we want to support thumb1 then we'll need to deal with optional +// CPSR defs that need to be added before the remaining operands. See s_cc_out +// for descriptions why. const MachineInstrBuilder & ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { MachineInstr *MI = &*MIB; @@ -160,7 +232,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { // Do we use a predicate? if (TII.isPredicable(MI)) AddDefaultPred(MIB); - + // Do we optionally set a predicate? Preds is size > 0 iff the predicate // defines CPSR. All other OptionalDefines in ARM are the CCR register. bool CPSR = false; @@ -297,7 +369,7 @@ unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode, uint64_t Imm) { unsigned ResultReg = createResultReg(RC); const TargetInstrDesc &II = TII.get(MachineInstOpcode); - + if (II.getNumDefs() >= 1) AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addImm(Imm)); @@ -323,16 +395,84 @@ unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT, return ResultReg; } -unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) { - EVT VT = TLI.getValueType(C->getType(), true); +// TODO: Don't worry about 64-bit now, but when this is fixed remove the +// checks from the various callers. +unsigned ARMFastISel::ARMMoveToFPReg(EVT VT, unsigned SrcReg) { + if (VT == MVT::f64) return 0; - // Only handle simple types. - if (!VT.isSimple()) return 0; - - // TODO: This should be safe for fp because they're just bits from the - // Constant. - // TODO: Theoretically we could materialize fp constants with instructions - // from VFP3. + unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VMOVRS), MoveReg) + .addReg(SrcReg)); + return MoveReg; +} + +unsigned ARMFastISel::ARMMoveToIntReg(EVT VT, unsigned SrcReg) { + if (VT == MVT::i64) return 0; + + unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VMOVSR), MoveReg) + .addReg(SrcReg)); + return MoveReg; +} + +// For double width floating point we need to materialize two constants +// (the high and the low) into integer registers then use a move to get +// the combined constant into an FP reg. +unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, EVT VT) { + const APFloat Val = CFP->getValueAPF(); + bool is64bit = VT == MVT::f64; + + // This checks to see if we can use VFP3 instructions to materialize + // a constant, otherwise we have to go through the constant pool. + if (TLI.isFPImmLegal(Val, VT)) { + unsigned Opc = is64bit ? ARM::FCONSTD : ARM::FCONSTS; + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), + DestReg) + .addFPImm(CFP)); + return DestReg; + } + + // Require VFP2 for loading fp constants. + if (!Subtarget->hasVFP2()) return false; + + // MachineConstantPool wants an explicit alignment. + unsigned Align = TD.getPrefTypeAlignment(CFP->getType()); + if (Align == 0) { + // TODO: Figure out if this is correct. + Align = TD.getTypeAllocSize(CFP->getType()); + } + unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS; + + // The extra reg is for addrmode5. + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), + DestReg) + .addConstantPoolIndex(Idx) + .addReg(0)); + return DestReg; +} + +unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, EVT VT) { + + // For now 32-bit only. + if (VT != MVT::i32) return false; + + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + + // If we can do this in a single instruction without a constant pool entry + // do so now. + const ConstantInt *CI = cast<ConstantInt>(C); + if (Subtarget->hasV6T2Ops() && isUInt<16>(CI->getSExtValue())) { + unsigned Opc = isThumb ? ARM::t2MOVi16 : ARM::MOVi16; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), DestReg) + .addImm(CI->getSExtValue())); + return DestReg; + } // MachineConstantPool wants an explicit alignment. unsigned Align = TD.getPrefTypeAlignment(C->getType()); @@ -342,58 +482,144 @@ unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) { } unsigned Idx = MCP.getConstantPoolIndex(C, Align); - unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); - // Different addressing modes between ARM/Thumb2 for constant pool loads. if (isThumb) AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(ARM::t2LDRpci)) - .addReg(DestReg).addConstantPoolIndex(Idx)); + TII.get(ARM::t2LDRpci), DestReg) + .addConstantPoolIndex(Idx)); else + // The extra immediate is for addrmode2. AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(ARM::LDRcp)) - .addReg(DestReg).addConstantPoolIndex(Idx) - .addReg(0).addImm(0)); - + TII.get(ARM::LDRcp), DestReg) + .addConstantPoolIndex(Idx) + .addImm(0)); + return DestReg; } -bool ARMFastISel::isTypeLegal(const Type *Ty, EVT &VT) { - VT = TLI.getValueType(Ty, true); - +unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) { + // For now 32-bit only. + if (VT != MVT::i32) return 0; + + Reloc::Model RelocM = TM.getRelocationModel(); + + // TODO: No external globals for now. + if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) return 0; + + // TODO: Need more magic for ARM PIC. + if (!isThumb && (RelocM == Reloc::PIC_)) return 0; + + // MachineConstantPool wants an explicit alignment. + unsigned Align = TD.getPrefTypeAlignment(GV->getType()); + if (Align == 0) { + // TODO: Figure out if this is correct. + Align = TD.getTypeAllocSize(GV->getType()); + } + + // Grab index. + unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb() ? 4 : 8); + unsigned Id = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, Id, + ARMCP::CPValue, PCAdj); + unsigned Idx = MCP.getConstantPoolIndex(CPV, Align); + + // Load value. + MachineInstrBuilder MIB; + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + if (isThumb) { + unsigned Opc = (RelocM != Reloc::PIC_) ? ARM::t2LDRpci : ARM::t2LDRpci_pic; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg) + .addConstantPoolIndex(Idx); + if (RelocM == Reloc::PIC_) + MIB.addImm(Id); + } else { + // The extra immediate is for addrmode2. + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp), + DestReg) + .addConstantPoolIndex(Idx) + .addImm(0); + } + AddOptionalDefs(MIB); + return DestReg; +} + +unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) { + EVT VT = TLI.getValueType(C->getType(), true); + // Only handle simple types. - if (VT == MVT::Other || !VT.isSimple()) return false; - + if (!VT.isSimple()) return 0; + + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return ARMMaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return ARMMaterializeGV(GV, VT); + else if (isa<ConstantInt>(C)) + return ARMMaterializeInt(C, VT); + + return 0; +} + +unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) { + // Don't handle dynamic allocas. + if (!FuncInfo.StaticAllocaMap.count(AI)) return 0; + + MVT VT; + if (!isLoadTypeLegal(AI->getType(), VT)) return false; + + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + + // This will get lowered later into the correct offsets and registers + // via rewriteXFrameIndex. + if (SI != FuncInfo.StaticAllocaMap.end()) { + TargetRegisterClass* RC = TLI.getRegClassFor(VT); + unsigned ResultReg = createResultReg(RC); + unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, *FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg) + .addFrameIndex(SI->second) + .addImm(0)); + return ResultReg; + } + + return 0; +} + +bool ARMFastISel::isTypeLegal(const Type *Ty, MVT &VT) { + EVT evt = TLI.getValueType(Ty, true); + + // Only handle simple types. + if (evt == MVT::Other || !evt.isSimple()) return false; + VT = evt.getSimpleVT(); + // Handle all legal types, i.e. a register that will directly hold this // value. return TLI.isTypeLegal(VT); } -bool ARMFastISel::isLoadTypeLegal(const Type *Ty, EVT &VT) { +bool ARMFastISel::isLoadTypeLegal(const Type *Ty, MVT &VT) { if (isTypeLegal(Ty, VT)) return true; - + // If this is a type than can be sign or zero-extended to a basic operation // go ahead and accept it now. if (VT == MVT::i8 || VT == MVT::i16) return true; - + return false; } -// Computes the Reg+Offset to get to an object. -bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, unsigned &Reg, - int &Offset) { +// Computes the address to get to an object. +bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { // Some boilerplate from the X86 FastISel. const User *U = NULL; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast<Instruction>(Obj)) { - // Don't walk into other basic blocks; it's possible we haven't - // visited them yet, so the instructions may not yet be assigned - // virtual registers. - if (FuncInfo.MBBMap[I->getParent()] != FuncInfo.MBB) - return false; - - Opcode = I->getOpcode(); - U = I; + // Don't walk into other basic blocks unless the object is an alloca from + // another block, otherwise it may not have a virtual register assigned. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) { Opcode = C->getOpcode(); U = C; @@ -404,141 +630,282 @@ bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, unsigned &Reg, // Fast instruction selection doesn't support the special // address spaces. return false; - + switch (Opcode) { - default: - //errs() << "Failing Opcode is: " << *Op1 << "\n"; + default: break; + case Instruction::BitCast: { + // Look through bitcasts. + return ARMComputeAddress(U->getOperand(0), Addr); + } + case Instruction::IntToPtr: { + // Look past no-op inttoptrs. + if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return ARMComputeAddress(U->getOperand(0), Addr); + break; + } + case Instruction::PtrToInt: { + // Look past no-op ptrtoints. + if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return ARMComputeAddress(U->getOperand(0), Addr); + break; + } + case Instruction::GetElementPtr: { + Address SavedAddr = Addr; + int TmpOffset = Addr.Offset; + + // Iterate through the GEP folding the constants into offsets where + // we can. + gep_type_iterator GTI = gep_type_begin(U); + for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); + i != e; ++i, ++GTI) { + const Value *Op = *i; + if (const StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = TD.getStructLayout(STy); + unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); + TmpOffset += SL->getElementOffset(Idx); + } else { + uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(Op); + do { + Op = Worklist.pop_back_val(); + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + TmpOffset += CI->getSExtValue() * S; + } else if (isa<AddOperator>(Op) && + isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { + // An add with a constant operand. Fold the constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + TmpOffset += CI->getSExtValue() * S; + // Add the other operand back to the work list. + Worklist.push_back(cast<AddOperator>(Op)->getOperand(0)); + } else + goto unsupported_gep; + } while (!Worklist.empty()); + } + } + + // Try to grab the base operand now. + Addr.Offset = TmpOffset; + if (ARMComputeAddress(U->getOperand(0), Addr)) return true; + + // We failed, restore everything and try the other options. + Addr = SavedAddr; + + unsupported_gep: + break; + } case Instruction::Alloca: { - assert(false && "Alloca should have been handled earlier!"); - return false; + const AllocaInst *AI = cast<AllocaInst>(Obj); + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + if (SI != FuncInfo.StaticAllocaMap.end()) { + Addr.BaseType = Address::FrameIndexBase; + Addr.Base.FI = SI->second; + return true; + } + break; } } - + + // Materialize the global variable's address into a reg which can + // then be used later to load the variable. if (const GlobalValue *GV = dyn_cast<GlobalValue>(Obj)) { - //errs() << "Failing GV is: " << GV << "\n"; - (void)GV; - return false; + unsigned Tmp = ARMMaterializeGV(GV, TLI.getValueType(Obj->getType())); + if (Tmp == 0) return false; + + Addr.Base.Reg = Tmp; + return true; } - + // Try to get this in a register if nothing else has worked. - Reg = getRegForValue(Obj); - if (Reg == 0) return false; + if (Addr.Base.Reg == 0) Addr.Base.Reg = getRegForValue(Obj); + return Addr.Base.Reg != 0; +} + +void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT) { - // Since the offset may be too large for the load instruction + assert(VT.isSimple() && "Non-simple types are invalid here!"); + + bool needsLowering = false; + switch (VT.getSimpleVT().SimpleTy) { + default: + assert(false && "Unhandled load/store type!"); + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + // Integer loads/stores handle 12-bit offsets. + needsLowering = ((Addr.Offset & 0xfff) != Addr.Offset); + break; + case MVT::f32: + case MVT::f64: + // Floating point operands handle 8-bit offsets. + needsLowering = ((Addr.Offset & 0xff) != Addr.Offset); + break; + } + + // If this is a stack pointer and the offset needs to be simplified then + // put the alloca address into a register, set the base type back to + // register and continue. This should almost never happen. + if (needsLowering && Addr.BaseType == Address::FrameIndexBase) { + TargetRegisterClass *RC = isThumb ? ARM::tGPRRegisterClass : + ARM::GPRRegisterClass; + unsigned ResultReg = createResultReg(RC); + unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, *FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg) + .addFrameIndex(Addr.Base.FI) + .addImm(0)); + Addr.Base.Reg = ResultReg; + Addr.BaseType = Address::RegBase; + } + + // Since the offset is too large for the load/store instruction // get the reg+offset into a register. - // TODO: Verify the additions work, otherwise we'll need to add the - // offset instead of 0 to the instructions and do all sorts of operand - // munging. - // TODO: Optimize this somewhat. - if (Offset != 0) { + if (needsLowering) { ARMCC::CondCodes Pred = ARMCC::AL; unsigned PredReg = 0; + TargetRegisterClass *RC = isThumb ? ARM::tGPRRegisterClass : + ARM::GPRRegisterClass; + unsigned BaseReg = createResultReg(RC); + if (!isThumb) emitARMRegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - Reg, Reg, Offset, Pred, PredReg, + BaseReg, Addr.Base.Reg, Addr.Offset, + Pred, PredReg, static_cast<const ARMBaseInstrInfo&>(TII)); else { assert(AFI->isThumb2Function()); emitT2RegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - Reg, Reg, Offset, Pred, PredReg, + BaseReg, Addr.Base.Reg, Addr.Offset, Pred, PredReg, static_cast<const ARMBaseInstrInfo&>(TII)); } + Addr.Offset = 0; + Addr.Base.Reg = BaseReg; } - - return true; } -bool ARMFastISel::ARMLoadAlloca(const Instruction *I) { - Value *Op0 = I->getOperand(0); +void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, + const MachineInstrBuilder &MIB) { + // addrmode5 output depends on the selection dag addressing dividing the + // offset by 4 that it then later multiplies. Do this here as well. + if (VT.getSimpleVT().SimpleTy == MVT::f32 || + VT.getSimpleVT().SimpleTy == MVT::f64) + Addr.Offset /= 4; + + // Frame base works a bit differently. Handle it separately. + if (Addr.BaseType == Address::FrameIndexBase) { + int FI = Addr.Base.FI; + int Offset = Addr.Offset; + MachineMemOperand *MMO = + FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(FI, Offset), + MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + // Now add the rest of the operands. + MIB.addFrameIndex(FI); - // Verify it's an alloca. - if (const AllocaInst *AI = dyn_cast<AllocaInst>(Op0)) { - DenseMap<const AllocaInst*, int>::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - - if (SI != FuncInfo.StaticAllocaMap.end()) { - TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); - unsigned ResultReg = createResultReg(RC); - TII.loadRegFromStackSlot(*FuncInfo.MBB, *FuncInfo.InsertPt, - ResultReg, SI->second, RC, - TM.getRegisterInfo()); - UpdateValueMap(I, ResultReg); - return true; - } + // ARM halfword load/stores need an additional operand. + if (!isThumb && VT.getSimpleVT().SimpleTy == MVT::i16) MIB.addReg(0); + + MIB.addImm(Addr.Offset); + MIB.addMemOperand(MMO); + } else { + // Now add the rest of the operands. + MIB.addReg(Addr.Base.Reg); + + // ARM halfword load/stores need an additional operand. + if (!isThumb && VT.getSimpleVT().SimpleTy == MVT::i16) MIB.addReg(0); + + MIB.addImm(Addr.Offset); } - return false; + AddOptionalDefs(MIB); } -bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, - unsigned Reg, int Offset) { - +bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr) { + assert(VT.isSimple() && "Non-simple types are invalid here!"); unsigned Opc; - + TargetRegisterClass *RC; switch (VT.getSimpleVT().SimpleTy) { - default: - assert(false && "Trying to emit for an unhandled type!"); - return false; + // This is mostly going to be Neon/vector support. + default: return false; case MVT::i16: - Opc = isThumb ? ARM::tLDRH : ARM::LDRH; - VT = MVT::i32; + Opc = isThumb ? ARM::t2LDRHi12 : ARM::LDRH; + RC = ARM::GPRRegisterClass; break; case MVT::i8: - Opc = isThumb ? ARM::tLDRB : ARM::LDRB; - VT = MVT::i32; + Opc = isThumb ? ARM::t2LDRBi12 : ARM::LDRBi12; + RC = ARM::GPRRegisterClass; break; case MVT::i32: - Opc = isThumb ? ARM::tLDR : ARM::LDR; + Opc = isThumb ? ARM::t2LDRi12 : ARM::LDRi12; + RC = ARM::GPRRegisterClass; + break; + case MVT::f32: + Opc = ARM::VLDRS; + RC = TLI.getRegClassFor(VT); + break; + case MVT::f64: + Opc = ARM::VLDRD; + RC = TLI.getRegClassFor(VT); break; } - - ResultReg = createResultReg(TLI.getRegClassFor(VT)); - - // TODO: Fix the Addressing modes so that these can share some code. - // Since this is a Thumb1 load this will work in Thumb1 or 2 mode. - if (isThumb) - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(Opc), ResultReg) - .addReg(Reg).addImm(Offset).addReg(0)); - else - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(Opc), ResultReg) - .addReg(Reg).addReg(0).addImm(Offset)); - + // Simplify this down to something we can handle. + ARMSimplifyAddress(Addr, VT); + + // Create the base instruction, then add the operands. + ResultReg = createResultReg(RC); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg); + AddLoadStoreOperands(VT, Addr, MIB); return true; } -bool ARMFastISel::ARMStoreAlloca(const Instruction *I, unsigned SrcReg) { - Value *Op1 = I->getOperand(1); +bool ARMFastISel::SelectLoad(const Instruction *I) { + // Verify we have a legal type before going any further. + MVT VT; + if (!isLoadTypeLegal(I->getType(), VT)) + return false; - // Verify it's an alloca. - if (const AllocaInst *AI = dyn_cast<AllocaInst>(Op1)) { - DenseMap<const AllocaInst*, int>::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + // See if we can handle this address. + Address Addr; + if (!ARMComputeAddress(I->getOperand(0), Addr)) return false; - if (SI != FuncInfo.StaticAllocaMap.end()) { - TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); - assert(SrcReg != 0 && "Nothing to store!"); - TII.storeRegToStackSlot(*FuncInfo.MBB, *FuncInfo.InsertPt, - SrcReg, true /*isKill*/, SI->second, RC, - TM.getRegisterInfo()); - return true; - } - } - return false; + unsigned ResultReg; + if (!ARMEmitLoad(VT, ResultReg, Addr)) return false; + UpdateValueMap(I, ResultReg); + return true; } -bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, - unsigned DstReg, int Offset) { +bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) { unsigned StrOpc; switch (VT.getSimpleVT().SimpleTy) { + // This is mostly going to be Neon/vector support. default: return false; - case MVT::i1: - case MVT::i8: StrOpc = isThumb ? ARM::tSTRB : ARM::STRB; break; - case MVT::i16: StrOpc = isThumb ? ARM::tSTRH : ARM::STRH; break; - case MVT::i32: StrOpc = isThumb ? ARM::tSTR : ARM::STR; break; + case MVT::i1: { + unsigned Res = createResultReg(isThumb ? ARM::tGPRRegisterClass : + ARM::GPRRegisterClass); + unsigned Opc = isThumb ? ARM::t2ANDri : ARM::ANDri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), Res) + .addReg(SrcReg).addImm(1)); + SrcReg = Res; + } // Fallthrough here. + case MVT::i8: + StrOpc = isThumb ? ARM::t2STRBi12 : ARM::STRBi12; + break; + case MVT::i16: + StrOpc = isThumb ? ARM::t2STRHi12 : ARM::STRH; + break; + case MVT::i32: + StrOpc = isThumb ? ARM::t2STRi12 : ARM::STRi12; + break; case MVT::f32: if (!Subtarget->hasVFP2()) return false; StrOpc = ARM::VSTRS; @@ -548,91 +915,162 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, StrOpc = ARM::VSTRD; break; } - - if (isThumb) - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(StrOpc), SrcReg) - .addReg(DstReg).addImm(Offset).addReg(0)); - else - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(StrOpc), SrcReg) - .addReg(DstReg).addReg(0).addImm(Offset)); - + // Simplify this down to something we can handle. + ARMSimplifyAddress(Addr, VT); + + // Create the base instruction, then add the operands. + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(StrOpc)) + .addReg(SrcReg, getKillRegState(true)); + AddLoadStoreOperands(VT, Addr, MIB); return true; } -bool ARMFastISel::ARMSelectStore(const Instruction *I) { +bool ARMFastISel::SelectStore(const Instruction *I) { Value *Op0 = I->getOperand(0); unsigned SrcReg = 0; - // Yay type legalization - EVT VT; + // Verify we have a legal type before going any further. + MVT VT; if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT)) return false; // Get the value to be stored into a register. SrcReg = getRegForValue(Op0); - if (SrcReg == 0) - return false; - - // If we're an alloca we know we have a frame index and can emit the store - // quickly. - if (ARMStoreAlloca(I, SrcReg)) - return true; - - // Our register and offset with innocuous defaults. - unsigned Reg = 0; - int Offset = 0; - - // See if we can handle this as Reg + Offset - if (!ARMComputeRegOffset(I->getOperand(1), Reg, Offset)) - return false; - - if (!ARMEmitStore(VT, SrcReg, Reg, Offset /* 0 */)) return false; - - return false; - -} + if (SrcReg == 0) return false; -bool ARMFastISel::ARMSelectLoad(const Instruction *I) { - // If we're an alloca we know we have a frame index and can emit the load - // directly in short order. - if (ARMLoadAlloca(I)) - return true; - - // Verify we have a legal type before going any further. - EVT VT; - if (!isLoadTypeLegal(I->getType(), VT)) - return false; - - // Our register and offset with innocuous defaults. - unsigned Reg = 0; - int Offset = 0; - - // See if we can handle this as Reg + Offset - if (!ARMComputeRegOffset(I->getOperand(0), Reg, Offset)) + // See if we can handle this address. + Address Addr; + if (!ARMComputeAddress(I->getOperand(1), Addr)) return false; - - unsigned ResultReg; - if (!ARMEmitLoad(VT, ResultReg, Reg, Offset /* 0 */)) return false; - - UpdateValueMap(I, ResultReg); + + if (!ARMEmitStore(VT, SrcReg, Addr)) return false; return true; } -bool ARMFastISel::ARMSelectBranch(const Instruction *I) { +static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) { + switch (Pred) { + // Needs two compares... + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UEQ: + default: + // AL is our "false" for now. The other two need more compares. + return ARMCC::AL; + case CmpInst::ICMP_EQ: + case CmpInst::FCMP_OEQ: + return ARMCC::EQ; + case CmpInst::ICMP_SGT: + case CmpInst::FCMP_OGT: + return ARMCC::GT; + case CmpInst::ICMP_SGE: + case CmpInst::FCMP_OGE: + return ARMCC::GE; + case CmpInst::ICMP_UGT: + case CmpInst::FCMP_UGT: + return ARMCC::HI; + case CmpInst::FCMP_OLT: + return ARMCC::MI; + case CmpInst::ICMP_ULE: + case CmpInst::FCMP_OLE: + return ARMCC::LS; + case CmpInst::FCMP_ORD: + return ARMCC::VC; + case CmpInst::FCMP_UNO: + return ARMCC::VS; + case CmpInst::FCMP_UGE: + return ARMCC::PL; + case CmpInst::ICMP_SLT: + case CmpInst::FCMP_ULT: + return ARMCC::LT; + case CmpInst::ICMP_SLE: + case CmpInst::FCMP_ULE: + return ARMCC::LE; + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + return ARMCC::NE; + case CmpInst::ICMP_UGE: + return ARMCC::HS; + case CmpInst::ICMP_ULT: + return ARMCC::LO; + } +} + +bool ARMFastISel::SelectBranch(const Instruction *I) { const BranchInst *BI = cast<BranchInst>(I); MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - + // Simple branch support. - unsigned CondReg = getRegForValue(BI->getCondition()); - if (CondReg == 0) return false; - - unsigned CmpOpc = isThumb ? ARM::t2CMPrr : ARM::CMPrr; - unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; + + // If we can, avoid recomputing the compare - redoing it could lead to wonky + // behavior. + // TODO: Factor this out. + if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { + if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { + MVT VT; + const Type *Ty = CI->getOperand(0)->getType(); + if (!isTypeLegal(Ty, VT)) + return false; + + bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy()); + if (isFloat && !Subtarget->hasVFP2()) + return false; + + unsigned CmpOpc; + switch (VT.SimpleTy) { + default: return false; + // TODO: Verify compares. + case MVT::f32: + CmpOpc = ARM::VCMPES; + break; + case MVT::f64: + CmpOpc = ARM::VCMPED; + break; + case MVT::i32: + CmpOpc = isThumb ? ARM::t2CMPrr : ARM::CMPrr; + break; + } + + // Get the compare predicate. + ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate()); + + // We may not handle every CC for now. + if (ARMPred == ARMCC::AL) return false; + + unsigned Arg1 = getRegForValue(CI->getOperand(0)); + if (Arg1 == 0) return false; + + unsigned Arg2 = getRegForValue(CI->getOperand(1)); + if (Arg2 == 0) return false; + + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(CmpOpc)) + .addReg(Arg1).addReg(Arg2)); + + // For floating point we need to move the result to a comparison register + // that we can then use for branches. + if (isFloat) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::FMSTAT))); + + unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) + .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR); + FastEmitBranch(FBB, DL); + FuncInfo.MBB->addSuccessor(TBB); + return true; + } + } + + unsigned CmpReg = getRegForValue(BI->getCondition()); + if (CmpReg == 0) return false; + + // Re-set the flags just in case. + unsigned CmpOpc = isThumb ? ARM::t2CMPri : ARM::CMPri; AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc)) - .addReg(CondReg).addReg(CondReg)); + .addReg(CmpReg).addImm(0)); + + unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) .addMBB(TBB).addImm(ARMCC::NE).addReg(ARM::CPSR); FastEmitBranch(FBB, DL); @@ -640,18 +1078,809 @@ bool ARMFastISel::ARMSelectBranch(const Instruction *I) { return true; } +bool ARMFastISel::SelectCmp(const Instruction *I) { + const CmpInst *CI = cast<CmpInst>(I); + + MVT VT; + const Type *Ty = CI->getOperand(0)->getType(); + if (!isTypeLegal(Ty, VT)) + return false; + + bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy()); + if (isFloat && !Subtarget->hasVFP2()) + return false; + + unsigned CmpOpc; + unsigned CondReg; + switch (VT.SimpleTy) { + default: return false; + // TODO: Verify compares. + case MVT::f32: + CmpOpc = ARM::VCMPES; + CondReg = ARM::FPSCR; + break; + case MVT::f64: + CmpOpc = ARM::VCMPED; + CondReg = ARM::FPSCR; + break; + case MVT::i32: + CmpOpc = isThumb ? ARM::t2CMPrr : ARM::CMPrr; + CondReg = ARM::CPSR; + break; + } + + // Get the compare predicate. + ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate()); + + // We may not handle every CC for now. + if (ARMPred == ARMCC::AL) return false; + + unsigned Arg1 = getRegForValue(CI->getOperand(0)); + if (Arg1 == 0) return false; + + unsigned Arg2 = getRegForValue(CI->getOperand(1)); + if (Arg2 == 0) return false; + + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc)) + .addReg(Arg1).addReg(Arg2)); + + // For floating point we need to move the result to a comparison register + // that we can then use for branches. + if (isFloat) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::FMSTAT))); + + // Now set a register based on the comparison. Explicitly set the predicates + // here. + unsigned MovCCOpc = isThumb ? ARM::t2MOVCCi : ARM::MOVCCi; + TargetRegisterClass *RC = isThumb ? ARM::rGPRRegisterClass + : ARM::GPRRegisterClass; + unsigned DestReg = createResultReg(RC); + Constant *Zero + = ConstantInt::get(Type::getInt32Ty(*Context), 0); + unsigned ZeroReg = TargetMaterializeConstant(Zero); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), DestReg) + .addReg(ZeroReg).addImm(1) + .addImm(ARMPred).addReg(CondReg); + + UpdateValueMap(I, DestReg); + return true; +} + +bool ARMFastISel::SelectFPExt(const Instruction *I) { + // Make sure we have VFP and that we're extending float to double. + if (!Subtarget->hasVFP2()) return false; + + Value *V = I->getOperand(0); + if (!I->getType()->isDoubleTy() || + !V->getType()->isFloatTy()) return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) return false; + + unsigned Result = createResultReg(ARM::DPRRegisterClass); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VCVTDS), Result) + .addReg(Op)); + UpdateValueMap(I, Result); + return true; +} + +bool ARMFastISel::SelectFPTrunc(const Instruction *I) { + // Make sure we have VFP and that we're truncating double to float. + if (!Subtarget->hasVFP2()) return false; + + Value *V = I->getOperand(0); + if (!(I->getType()->isFloatTy() && + V->getType()->isDoubleTy())) return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) return false; + + unsigned Result = createResultReg(ARM::SPRRegisterClass); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VCVTSD), Result) + .addReg(Op)); + UpdateValueMap(I, Result); + return true; +} + +bool ARMFastISel::SelectSIToFP(const Instruction *I) { + // Make sure we have VFP. + if (!Subtarget->hasVFP2()) return false; + + MVT DstVT; + const Type *Ty = I->getType(); + if (!isTypeLegal(Ty, DstVT)) + return false; + + unsigned Op = getRegForValue(I->getOperand(0)); + if (Op == 0) return false; + + // The conversion routine works on fp-reg to fp-reg and the operand above + // was an integer, move it to the fp registers if possible. + unsigned FP = ARMMoveToFPReg(MVT::f32, Op); + if (FP == 0) return false; + + unsigned Opc; + if (Ty->isFloatTy()) Opc = ARM::VSITOS; + else if (Ty->isDoubleTy()) Opc = ARM::VSITOD; + else return 0; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), + ResultReg) + .addReg(FP)); + UpdateValueMap(I, ResultReg); + return true; +} + +bool ARMFastISel::SelectFPToSI(const Instruction *I) { + // Make sure we have VFP. + if (!Subtarget->hasVFP2()) return false; + + MVT DstVT; + const Type *RetTy = I->getType(); + if (!isTypeLegal(RetTy, DstVT)) + return false; + + unsigned Op = getRegForValue(I->getOperand(0)); + if (Op == 0) return false; + + unsigned Opc; + const Type *OpTy = I->getOperand(0)->getType(); + if (OpTy->isFloatTy()) Opc = ARM::VTOSIZS; + else if (OpTy->isDoubleTy()) Opc = ARM::VTOSIZD; + else return 0; + + // f64->s32 or f32->s32 both need an intermediate f32 reg. + unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), + ResultReg) + .addReg(Op)); + + // This result needs to be in an integer register, but the conversion only + // takes place in fp-regs. + unsigned IntReg = ARMMoveToIntReg(DstVT, ResultReg); + if (IntReg == 0) return false; + + UpdateValueMap(I, IntReg); + return true; +} + +bool ARMFastISel::SelectSelect(const Instruction *I) { + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + // Things need to be register sized for register moves. + if (VT != MVT::i32) return false; + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + + unsigned CondReg = getRegForValue(I->getOperand(0)); + if (CondReg == 0) return false; + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) return false; + unsigned Op2Reg = getRegForValue(I->getOperand(2)); + if (Op2Reg == 0) return false; + + unsigned CmpOpc = isThumb ? ARM::t2TSTri : ARM::TSTri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc)) + .addReg(CondReg).addImm(1)); + unsigned ResultReg = createResultReg(RC); + unsigned MovCCOpc = isThumb ? ARM::t2MOVCCr : ARM::MOVCCr; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg) + .addReg(Op1Reg).addReg(Op2Reg) + .addImm(ARMCC::EQ).addReg(ARM::CPSR); + UpdateValueMap(I, ResultReg); + return true; +} + +bool ARMFastISel::SelectSDiv(const Instruction *I) { + MVT VT; + const Type *Ty = I->getType(); + if (!isTypeLegal(Ty, VT)) + return false; + + // If we have integer div support we should have selected this automagically. + // In case we have a real miss go ahead and return false and we'll pick + // it up later. + if (Subtarget->hasDivide()) return false; + + // Otherwise emit a libcall. + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + if (VT == MVT::i8) + LC = RTLIB::SDIV_I8; + else if (VT == MVT::i16) + LC = RTLIB::SDIV_I16; + else if (VT == MVT::i32) + LC = RTLIB::SDIV_I32; + else if (VT == MVT::i64) + LC = RTLIB::SDIV_I64; + else if (VT == MVT::i128) + LC = RTLIB::SDIV_I128; + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); + + return ARMEmitLibcall(I, LC); +} + +bool ARMFastISel::SelectSRem(const Instruction *I) { + MVT VT; + const Type *Ty = I->getType(); + if (!isTypeLegal(Ty, VT)) + return false; + + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + if (VT == MVT::i8) + LC = RTLIB::SREM_I8; + else if (VT == MVT::i16) + LC = RTLIB::SREM_I16; + else if (VT == MVT::i32) + LC = RTLIB::SREM_I32; + else if (VT == MVT::i64) + LC = RTLIB::SREM_I64; + else if (VT == MVT::i128) + LC = RTLIB::SREM_I128; + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); + + return ARMEmitLibcall(I, LC); +} + +bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) { + EVT VT = TLI.getValueType(I->getType(), true); + + // We can get here in the case when we want to use NEON for our fp + // operations, but can't figure out how to. Just use the vfp instructions + // if we have them. + // FIXME: It'd be nice to use NEON instructions. + const Type *Ty = I->getType(); + bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy()); + if (isFloat && !Subtarget->hasVFP2()) + return false; + + unsigned Op1 = getRegForValue(I->getOperand(0)); + if (Op1 == 0) return false; + + unsigned Op2 = getRegForValue(I->getOperand(1)); + if (Op2 == 0) return false; + + unsigned Opc; + bool is64bit = VT == MVT::f64 || VT == MVT::i64; + switch (ISDOpcode) { + default: return false; + case ISD::FADD: + Opc = is64bit ? ARM::VADDD : ARM::VADDS; + break; + case ISD::FSUB: + Opc = is64bit ? ARM::VSUBD : ARM::VSUBS; + break; + case ISD::FMUL: + Opc = is64bit ? ARM::VMULD : ARM::VMULS; + break; + } + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg) + .addReg(Op1).addReg(Op2)); + UpdateValueMap(I, ResultReg); + return true; +} + +// Call Handling Code + +bool ARMFastISel::FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, + EVT SrcVT, unsigned &ResultReg) { + unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, + Src, /*TODO: Kill=*/false); + + if (RR != 0) { + ResultReg = RR; + return true; + } else + return false; +} + +// This is largely taken directly from CCAssignFnForNode - we don't support +// varargs in FastISel so that part has been removed. +// TODO: We may not support all of this. +CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) { + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::Fast: + // Ignore fastcc. Silence compiler warnings. + (void)RetFastCC_ARM_APCS; + (void)FastCC_ARM_APCS; + // Fallthrough + case CallingConv::C: + // Use target triple & subtarget features to do actual dispatch. + if (Subtarget->isAAPCS_ABI()) { + if (Subtarget->hasVFP2() && + FloatABIType == FloatABI::Hard) + return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + else + return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); + } else + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + case CallingConv::ARM_AAPCS_VFP: + return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + case CallingConv::ARM_AAPCS: + return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); + case CallingConv::ARM_APCS: + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + } +} + +bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, + SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<MVT> &ArgVTs, + SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, + SmallVectorImpl<unsigned> &RegArgs, + CallingConv::ID CC, + unsigned &NumBytes) { + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, false, TM, ArgLocs, *Context); + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC, false)); + + // Get a count of how many bytes are to be pushed on the stack. + NumBytes = CCInfo.getNextStackOffset(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TM.getRegisterInfo()->getCallFrameSetupOpcode(); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(AdjStackDown)) + .addImm(NumBytes)); + + // Process the args. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + unsigned Arg = ArgRegs[VA.getValNo()]; + MVT ArgVT = ArgVTs[VA.getValNo()]; + + // We don't handle NEON/vector parameters yet. + if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64) + return false; + + // Handle arg promotion, etc. + switch (VA.getLocInfo()) { + case CCValAssign::Full: break; + case CCValAssign::SExt: { + bool Emitted = FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + assert(Emitted && "Failed to emit a sext!"); (void)Emitted; + Emitted = true; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::ZExt: { + bool Emitted = FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + assert(Emitted && "Failed to emit a zext!"); (void)Emitted; + Emitted = true; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::AExt: { + bool Emitted = FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + if (!Emitted) + Emitted = FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + if (!Emitted) + Emitted = FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + + assert(Emitted && "Failed to emit a aext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::BCvt: { + unsigned BC = FastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg, + /*TODO: Kill=*/false); + assert(BC != 0 && "Failed to emit a bitcast!"); + Arg = BC; + ArgVT = VA.getLocVT(); + break; + } + default: llvm_unreachable("Unknown arg promotion!"); + } + + // Now copy/store arg to correct locations. + if (VA.isRegLoc() && !VA.needsCustom()) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + VA.getLocReg()) + .addReg(Arg); + RegArgs.push_back(VA.getLocReg()); + } else if (VA.needsCustom()) { + // TODO: We need custom lowering for vector (v2f64) args. + if (VA.getLocVT() != MVT::f64) return false; + + CCValAssign &NextVA = ArgLocs[++i]; + + // TODO: Only handle register args for now. + if(!(VA.isRegLoc() && NextVA.isRegLoc())) return false; + + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VMOVRRD), VA.getLocReg()) + .addReg(NextVA.getLocReg(), RegState::Define) + .addReg(Arg)); + RegArgs.push_back(VA.getLocReg()); + RegArgs.push_back(NextVA.getLocReg()); + } else { + assert(VA.isMemLoc()); + // Need to store on the stack. + Address Addr; + Addr.BaseType = Address::RegBase; + Addr.Base.Reg = ARM::SP; + Addr.Offset = VA.getLocMemOffset(); + + if (!ARMEmitStore(ArgVT, Arg, Addr)) return false; + } + } + return true; +} + +bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + const Instruction *I, CallingConv::ID CC, + unsigned &NumBytes) { + // Issue CALLSEQ_END + unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode(); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(AdjStackUp)) + .addImm(NumBytes).addImm(0)); + + // Now the return value. + if (RetVT != MVT::isVoid) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, false, TM, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true)); + + // Copy all of the result registers out of their specified physreg. + if (RVLocs.size() == 2 && RetVT == MVT::f64) { + // For this move we copy into two registers and then move into the + // double fp reg we want. + EVT DestVT = RVLocs[0].getValVT(); + TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT); + unsigned ResultReg = createResultReg(DstRC); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VMOVDRR), ResultReg) + .addReg(RVLocs[0].getLocReg()) + .addReg(RVLocs[1].getLocReg())); + + UsedRegs.push_back(RVLocs[0].getLocReg()); + UsedRegs.push_back(RVLocs[1].getLocReg()); + + // Finally update the result. + UpdateValueMap(I, ResultReg); + } else { + assert(RVLocs.size() == 1 &&"Can't handle non-double multi-reg retvals!"); + EVT CopyVT = RVLocs[0].getValVT(); + TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); + + unsigned ResultReg = createResultReg(DstRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(RVLocs[0].getLocReg()); + UsedRegs.push_back(RVLocs[0].getLocReg()); + + // Finally update the result. + UpdateValueMap(I, ResultReg); + } + } + + return true; +} + +bool ARMFastISel::SelectRet(const Instruction *I) { + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + + if (!FuncInfo.CanLowerReturn) + return false; + + if (F.isVarArg()) + return false; + + CallingConv::ID CC = F.getCallingConv(); + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(), + Outs, TLI); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext()); + CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */)); + + const Value *RV = Ret->getOperand(0); + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + + // Don't bother handling odd stuff for now. + if (VA.getLocInfo() != CCValAssign::Full) + return false; + // Only handle register returns for now. + if (!VA.isRegLoc()) + return false; + // TODO: For now, don't try to handle cases where getLocInfo() + // says Full but the types don't match. + if (TLI.getValueType(RV->getType()) != VA.getValVT()) + return false; + + // Make the copy. + unsigned SrcReg = Reg + VA.getValNo(); + unsigned DstReg = VA.getLocReg(); + const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); + // Avoid a cross-class copy. This is very unlikely. + if (!SrcRC->contains(DstReg)) + return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + DstReg).addReg(SrcReg); + + // Mark the register as live out of the function. + MRI.addLiveOut(VA.getLocReg()); + } + + unsigned RetOpc = isThumb ? ARM::tBX_RET : ARM::BX_RET; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(RetOpc))); + return true; +} + +// A quick function that will emit a call for a named libcall in F with the +// vector of passed arguments for the Instruction in I. We can assume that we +// can emit a call for any libcall we can produce. This is an abridged version +// of the full call infrastructure since we won't need to worry about things +// like computed function pointers or strange arguments at call sites. +// TODO: Try to unify this and the normal call bits for ARM, then try to unify +// with X86. +bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { + CallingConv::ID CC = TLI.getLibcallCallingConv(Call); + + // Handle *simple* calls for now. + const Type *RetTy = I->getType(); + MVT RetVT; + if (RetTy->isVoidTy()) + RetVT = MVT::isVoid; + else if (!isTypeLegal(RetTy, RetVT)) + return false; + + // For now we're using BLX etc on the assumption that we have v5t ops. + if (!Subtarget->hasV5TOps()) return false; + + // TODO: For now if we have long calls specified we don't handle the call. + if (EnableARMLongCalls) return false; + + // Set up the argument vectors. + SmallVector<Value*, 8> Args; + SmallVector<unsigned, 8> ArgRegs; + SmallVector<MVT, 8> ArgVTs; + SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; + Args.reserve(I->getNumOperands()); + ArgRegs.reserve(I->getNumOperands()); + ArgVTs.reserve(I->getNumOperands()); + ArgFlags.reserve(I->getNumOperands()); + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + Value *Op = I->getOperand(i); + unsigned Arg = getRegForValue(Op); + if (Arg == 0) return false; + + const Type *ArgTy = Op->getType(); + MVT ArgVT; + if (!isTypeLegal(ArgTy, ArgVT)) return false; + + ISD::ArgFlagsTy Flags; + unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); + Flags.setOrigAlign(OriginalAlignment); + + Args.push_back(Op); + ArgRegs.push_back(Arg); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Handle the arguments now that we've gotten them. + SmallVector<unsigned, 4> RegArgs; + unsigned NumBytes; + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + return false; + + // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops. + // TODO: Turn this into the table of arm call ops. + MachineInstrBuilder MIB; + unsigned CallOpc; + if(isThumb) { + CallOpc = Subtarget->isTargetDarwin() ? ARM::tBLXi_r9 : ARM::tBLXi; + // Explicitly adding the predicate here. + MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(CallOpc))) + .addExternalSymbol(TLI.getLibcallName(Call)); + } else { + CallOpc = Subtarget->isTargetDarwin() ? ARM::BLr9 : ARM::BL; + // Explicitly adding the predicate here. + MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(CallOpc)) + .addExternalSymbol(TLI.getLibcallName(Call))); + } + + // Add implicit physical register uses to the call. + for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) + MIB.addReg(RegArgs[i]); + + // Finish off the call including any return values. + SmallVector<unsigned, 4> UsedRegs; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false; + + // Set all unused physreg defs as dead. + static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; +} + +bool ARMFastISel::SelectCall(const Instruction *I) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + + // Can't handle inline asm or worry about intrinsics yet. + if (isa<InlineAsm>(Callee) || isa<IntrinsicInst>(CI)) return false; + + // Only handle global variable Callees that are direct calls. + const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); + if (!GV || Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel())) + return false; + + // Check the calling convention. + ImmutableCallSite CS(CI); + CallingConv::ID CC = CS.getCallingConv(); + + // TODO: Avoid some calling conventions? + + // Let SDISel handle vararg functions. + const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); + const FunctionType *FTy = cast<FunctionType>(PT->getElementType()); + if (FTy->isVarArg()) + return false; + + // Handle *simple* calls for now. + const Type *RetTy = I->getType(); + MVT RetVT; + if (RetTy->isVoidTy()) + RetVT = MVT::isVoid; + else if (!isTypeLegal(RetTy, RetVT)) + return false; + + // For now we're using BLX etc on the assumption that we have v5t ops. + // TODO: Maybe? + if (!Subtarget->hasV5TOps()) return false; + + // TODO: For now if we have long calls specified we don't handle the call. + if (EnableARMLongCalls) return false; + + // Set up the argument vectors. + SmallVector<Value*, 8> Args; + SmallVector<unsigned, 8> ArgRegs; + SmallVector<MVT, 8> ArgVTs; + SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; + Args.reserve(CS.arg_size()); + ArgRegs.reserve(CS.arg_size()); + ArgVTs.reserve(CS.arg_size()); + ArgFlags.reserve(CS.arg_size()); + for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); + i != e; ++i) { + unsigned Arg = getRegForValue(*i); + + if (Arg == 0) + return false; + ISD::ArgFlagsTy Flags; + unsigned AttrInd = i - CS.arg_begin() + 1; + if (CS.paramHasAttr(AttrInd, Attribute::SExt)) + Flags.setSExt(); + if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) + Flags.setZExt(); + + // FIXME: Only handle *easy* calls for now. + if (CS.paramHasAttr(AttrInd, Attribute::InReg) || + CS.paramHasAttr(AttrInd, Attribute::StructRet) || + CS.paramHasAttr(AttrInd, Attribute::Nest) || + CS.paramHasAttr(AttrInd, Attribute::ByVal)) + return false; + + const Type *ArgTy = (*i)->getType(); + MVT ArgVT; + if (!isTypeLegal(ArgTy, ArgVT)) + return false; + unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); + Flags.setOrigAlign(OriginalAlignment); + + Args.push_back(*i); + ArgRegs.push_back(Arg); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Handle the arguments now that we've gotten them. + SmallVector<unsigned, 4> RegArgs; + unsigned NumBytes; + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + return false; + + // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops. + // TODO: Turn this into the table of arm call ops. + MachineInstrBuilder MIB; + unsigned CallOpc; + // Explicitly adding the predicate here. + if(isThumb) { + CallOpc = Subtarget->isTargetDarwin() ? ARM::tBLXi_r9 : ARM::tBLXi; + // Explicitly adding the predicate here. + MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(CallOpc))) + .addGlobalAddress(GV, 0, 0); + } else { + CallOpc = Subtarget->isTargetDarwin() ? ARM::BLr9 : ARM::BL; + // Explicitly adding the predicate here. + MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(CallOpc)) + .addGlobalAddress(GV, 0, 0)); + } + + // Add implicit physical register uses to the call. + for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) + MIB.addReg(RegArgs[i]); + + // Finish off the call including any return values. + SmallVector<unsigned, 4> UsedRegs; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false; + + // Set all unused physreg defs as dead. + static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; + +} + // TODO: SoftFP support. bool ARMFastISel::TargetSelectInstruction(const Instruction *I) { - // No Thumb-1 for now. - if (isThumb && !AFI->isThumb2Function()) return false; - + switch (I->getOpcode()) { case Instruction::Load: - return ARMSelectLoad(I); + return SelectLoad(I); case Instruction::Store: - return ARMSelectStore(I); + return SelectStore(I); case Instruction::Br: - return ARMSelectBranch(I); + return SelectBranch(I); + case Instruction::ICmp: + case Instruction::FCmp: + return SelectCmp(I); + case Instruction::FPExt: + return SelectFPExt(I); + case Instruction::FPTrunc: + return SelectFPTrunc(I); + case Instruction::SIToFP: + return SelectSIToFP(I); + case Instruction::FPToSI: + return SelectFPToSI(I); + case Instruction::FAdd: + return SelectBinaryOp(I, ISD::FADD); + case Instruction::FSub: + return SelectBinaryOp(I, ISD::FSUB); + case Instruction::FMul: + return SelectBinaryOp(I, ISD::FMUL); + case Instruction::SDiv: + return SelectSDiv(I); + case Instruction::SRem: + return SelectSRem(I); + case Instruction::Call: + return SelectCall(I); + case Instruction::Select: + return SelectSelect(I); + case Instruction::Ret: + return SelectRet(I); default: break; } return false; @@ -659,7 +1888,14 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) { namespace llvm { llvm::FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo) { - if (EnableARMFastISel) return new ARMFastISel(funcInfo); + // Completely untested on non-darwin. + const TargetMachine &TM = funcInfo.MF->getTarget(); + + // Darwin and thumb1 only for now. + const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>(); + if (Subtarget->isTargetDarwin() && !Subtarget->isThumb1Only() && + !DisableARMFastISel) + return new ARMFastISel(funcInfo); return 0; } } diff --git a/contrib/llvm/lib/Target/ARM/ARMFixupKinds.h b/contrib/llvm/lib/Target/ARM/ARMFixupKinds.h new file mode 100644 index 0000000..3d175e3 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMFixupKinds.h @@ -0,0 +1,97 @@ +//===-- ARM/ARMFixupKinds.h - ARM Specific Fixup Entries --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ARM_ARMFIXUPKINDS_H +#define LLVM_ARM_ARMFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace ARM { +enum Fixups { + // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol + // addresses + fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind, + + // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with + // the 16-bit halfwords reordered. + fixup_t2_ldst_pcrel_12, + + // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses + // used in VFP instructions where the lower 2 bits are not encoded + // (so it's encoded as an 8-bit immediate). + fixup_arm_pcrel_10, + // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for + // the short-swapped encoding of Thumb2 instructions. + fixup_t2_pcrel_10, + // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol + // addresses where the lower 2 bits are not encoded (so it's encoded as an + // 8-bit immediate). + fixup_thumb_adr_pcrel_10, + // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR + // instruction. + fixup_arm_adr_pcrel_12, + // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR + // instruction. + fixup_t2_adr_pcrel_12, + // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch + // instructions. + fixup_arm_condbranch, + // fixup_arm_uncondbranch - 24-bit PC relative relocation for + // branch instructions. (unconditional) + fixup_arm_uncondbranch, + // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct + // uconditional branch instructions. + fixup_t2_condbranch, + // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct + // branch unconditional branch instructions. + fixup_t2_uncondbranch, + + // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions. + fixup_arm_thumb_br, + + // fixup_arm_thumb_blx - Fixup for Thumb BL instructions. + fixup_arm_thumb_bl, + + // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions. + fixup_arm_thumb_blx, + + // fixup_arm_thumb_cb - Fixup for Thumb branch instructions. + fixup_arm_thumb_cb, + + // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs. + fixup_arm_thumb_cp, + + // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions. + fixup_arm_thumb_bcc, + + // The next two are for the movt/movw pair + // the 16bit imm field are split into imm{15-12} and imm{11-0} + fixup_arm_movt_hi16, // :upper16: + fixup_arm_movw_lo16, // :lower16: + fixup_t2_movt_hi16, // :upper16: + fixup_t2_movw_lo16, // :lower16: + + // It is possible to create an "immediate" that happens to be pcrel. + // movw r0, :lower16:Foo-(Bar+8) and movt r0, :upper16:Foo-(Bar+8) + // result in different reloc tags than the above two. + // Needed to support ELF::R_ARM_MOVT_PREL and ELF::R_ARM_MOVW_PREL_NC + fixup_arm_movt_hi16_pcrel, // :upper16: + fixup_arm_movw_lo16_pcrel, // :lower16: + fixup_t2_movt_hi16_pcrel, // :upper16: + fixup_t2_movw_lo16_pcrel, // :lower16: + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameInfo.h b/contrib/llvm/lib/Target/ARM/ARMFrameInfo.h deleted file mode 100644 index d5dae24..0000000 --- a/contrib/llvm/lib/Target/ARM/ARMFrameInfo.h +++ /dev/null @@ -1,32 +0,0 @@ -//===-- ARMTargetFrameInfo.h - Define TargetFrameInfo for ARM ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -// -//===----------------------------------------------------------------------===// - -#ifndef ARM_FRAMEINFO_H -#define ARM_FRAMEINFO_H - -#include "ARM.h" -#include "ARMSubtarget.h" -#include "llvm/Target/TargetFrameInfo.h" - -namespace llvm { - -class ARMFrameInfo : public TargetFrameInfo { -public: - explicit ARMFrameInfo(const ARMSubtarget &ST) - : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0, 4) { - } -}; - -} // End llvm namespace - -#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp new file mode 100644 index 0000000..f42c6db --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -0,0 +1,1021 @@ +//=======- ARMFrameLowering.cpp - ARM Frame Information --------*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "ARMFrameLowering.h" +#include "ARMAddressingModes.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. +bool ARMFrameLowering::hasFP(const MachineFunction &MF) const { + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + + // Mac OS X requires FP not to be clobbered for backtracing purpose. + if (STI.isTargetDarwin()) + return true; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Always eliminate non-leaf frame pointers. + return ((DisableFramePointerElim(MF) && MFI->hasCalls()) || + RegInfo->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken()); +} + +/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is +/// not required, we reserve argument space for call sites in the function +/// immediately on entry to the current function. This eliminates the need for +/// add/sub sp brackets around call sites. Returns true if the call frame is +/// included as part of the stack frame. +bool ARMFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + unsigned CFSize = FFI->getMaxCallFrameSize(); + // It's not always a good idea to include the call frame as part of the + // stack frame. ARM (especially Thumb) has small immediate offset to + // address the stack frame. So a large call frame can cause poor codegen + // and may even makes it impossible to scavenge a register. + if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12 + return false; + + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +/// canSimplifyCallFramePseudos - If there is a reserved call frame, the +/// call frame pseudos can be simplified. Unlike most targets, having a FP +/// is not sufficient here since we still may reference some objects via SP +/// even when FP is available in Thumb2 mode. +bool +ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { + return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects(); +} + +static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +static bool isCSRestore(MachineInstr *MI, + const ARMBaseInstrInfo &TII, + const unsigned *CSRegs) { + // Integer spill area is handled with "pop". + if (MI->getOpcode() == ARM::LDMIA_RET || + MI->getOpcode() == ARM::t2LDMIA_RET || + MI->getOpcode() == ARM::LDMIA_UPD || + MI->getOpcode() == ARM::t2LDMIA_UPD || + MI->getOpcode() == ARM::VLDMDIA_UPD) { + // The first two operands are predicates. The last two are + // imp-def and imp-use of SP. Check everything in between. + for (int i = 5, e = MI->getNumOperands(); i != e; ++i) + if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs)) + return false; + return true; + } + if ((MI->getOpcode() == ARM::LDR_POST || + MI->getOpcode() == ARM::t2LDR_POST) && + isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) && + MI->getOperand(1).getReg() == ARM::SP) + return true; + + return false; +} + +static void +emitSPUpdate(bool isARM, + MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + DebugLoc dl, const ARMBaseInstrInfo &TII, + int NumBytes, + ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, + Pred, PredReg, TII); + else + emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, + Pred, PredReg, TII); +} + +void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const ARMBaseRegisterInfo *RegInfo = + static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo()); + const ARMBaseInstrInfo &TII = + *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo()); + assert(!AFI->isThumb1OnlyFunction() && + "This emitPrologue does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned NumBytes = MFI->getStackSize(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + // Determine the sizes of each callee-save spill areas and record which frame + // belongs to which callee-save spill areas. + unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; + int FramePtrSpillFI = 0; + + // Allocate the vararg register save area. This is not counted in NumBytes. + if (VARegSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); + return; + } + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + int FI = CSI[i].getFrameIdx(); + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + break; + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + if (STI.isTargetDarwin()) { + AFI->addGPRCalleeSavedArea2Frame(FI); + GPRCS2Size += 4; + } else { + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + } + break; + default: + AFI->addDPRCalleeSavedAreaFrame(FI); + DPRCSSize += 8; + } + } + + // Move past area 1. + if (GPRCS1Size > 0) MBBI++; + + // Set FP to point to the stack slot that contains the previous FP. + // For Darwin, FP is R7, which has now been stored in spill area 1. + // Otherwise, if this is not Darwin, all the callee-saved registers go + // into spill area 1, including the FP in R11. In either case, it is + // now safe to emit this assignment. + bool HasFP = hasFP(MF); + if (HasFP) { + unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr) + .addFrameIndex(FramePtrSpillFI).addImm(0); + AddDefaultCC(AddDefaultPred(MIB)); + } + + // Move past area 2. + if (GPRCS2Size > 0) MBBI++; + + // Determine starting offsets of spill areas. + unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); + unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; + unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; + if (HasFP) + AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + + NumBytes); + AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); + AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); + AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); + + // Move past area 3. + if (DPRCSSize > 0) MBBI++; + + NumBytes = DPRCSOffset; + if (NumBytes) { + // Adjust SP after all the callee-save spills. + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); + if (HasFP && isARM) + // Restore from fp only in ARM mode: e.g. sub sp, r7, #24 + // Note it's not safe to do this in Thumb2 mode because it would have + // taken two instructions: + // mov sp, r7 + // sub sp, #24 + // If an interrupt is taken between the two instructions, then sp is in + // an inconsistent state (pointing to the middle of callee-saved area). + // The interrupt handler can end up clobbering the registers. + AFI->setShouldRestoreSPFromFP(true); + } + + if (STI.isTargetELF() && hasFP(MF)) + MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - + AFI->getFramePtrSpillOffset()); + + AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); + AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); + AFI->setDPRCalleeSavedAreaSize(DPRCSSize); + + // If we need dynamic stack realignment, do it here. Be paranoid and make + // sure if we also have VLAs, we have a base pointer for frame access. + if (RegInfo->needsStackRealignment(MF)) { + unsigned MaxAlign = MFI->getMaxAlignment(); + assert (!AFI->isThumb1OnlyFunction()); + if (!AFI->isThumbFunction()) { + // Emit bic sp, sp, MaxAlign + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, + TII.get(ARM::BICri), ARM::SP) + .addReg(ARM::SP, RegState::Kill) + .addImm(MaxAlign-1))); + } else { + // We cannot use sp as source/dest register here, thus we're emitting the + // following sequence: + // mov r4, sp + // bic r4, r4, MaxAlign + // mov sp, r4 + // FIXME: It will be better just to find spare register here. + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2tgpr), ARM::R4) + .addReg(ARM::SP, RegState::Kill); + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, + TII.get(ARM::t2BICri), ARM::R4) + .addReg(ARM::R4, RegState::Kill) + .addImm(MaxAlign-1))); + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) + .addReg(ARM::R4, RegState::Kill); + } + + AFI->setShouldRestoreSPFromFP(true); + } + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (RegInfo->hasBasePointer(MF)) { + if (isARM) + BuildMI(MBB, MBBI, dl, + TII.get(ARM::MOVr), RegInfo->getBaseRegister()) + .addReg(ARM::SP) + .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + BuildMI(MBB, MBBI, dl, + TII.get(ARM::tMOVgpr2gpr), RegInfo->getBaseRegister()) + .addReg(ARM::SP); + } + + // If the frame has variable sized objects then the epilogue must restore + // the sp from fp. We can assume there's an FP here since hasFP already + // checks for hasVarSizedObjects. + if (MFI->hasVarSizedObjects()) + AFI->setShouldRestoreSPFromFP(true); +} + +void ARMFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getDesc().isReturn() && + "Can only insert epilog into returning blocks"); + unsigned RetOpcode = MBBI->getOpcode(); + DebugLoc dl = MBBI->getDebugLoc(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + const ARMBaseInstrInfo &TII = + *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo()); + assert(!AFI->isThumb1OnlyFunction() && + "This emitEpilogue does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + int NumBytes = (int)MFI->getStackSize(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + } else { + // Unwind MBBI to point to first LDR / VLDRD. + const unsigned *CSRegs = RegInfo->getCalleeSavedRegs(); + if (MBBI != MBB.begin()) { + do + --MBBI; + while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); + if (!isCSRestore(MBBI, TII, CSRegs)) + ++MBBI; + } + + // Move SP to start of FP callee save spill area. + NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + + AFI->getGPRCalleeSavedArea2Size() + + AFI->getDPRCalleeSavedAreaSize()); + + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot or target is ELF and the function has FP. + if (AFI->shouldRestoreSPFromFP()) { + NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; + if (NumBytes) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + else { + // It's not possible to restore SP from FP in a single instruction. + // For Darwin, this looks like: + // mov sp, r7 + // sub sp, #24 + // This is bad, if an interrupt is taken after the mov, sp is in an + // inconsistent state. + // Use the first callee-saved register as a scratch register. + assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) && + "No scratch register to restore SP from FP!"); + emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP) + .addReg(ARM::R4); + } + } else { + // Thumb2 or ARM. + if (isARM) + BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP) + .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP) + .addReg(FramePtr); + } + } else if (NumBytes) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + + // Increment past our save areas. + if (AFI->getDPRCalleeSavedAreaSize()) MBBI++; + if (AFI->getGPRCalleeSavedArea2Size()) MBBI++; + if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; + } + + if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND || + RetOpcode == ARM::TCRETURNri || RetOpcode == ARM::TCRETURNriND) { + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + + // Jump to label or value in register. + if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND) { + unsigned TCOpcode = (RetOpcode == ARM::TCRETURNdi) + ? (STI.isThumb() ? ARM::TAILJMPdt : ARM::TAILJMPd) + : (STI.isThumb() ? ARM::TAILJMPdNDt : ARM::TAILJMPdND); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + } else if (RetOpcode == ARM::TCRETURNri) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPr)). + addReg(JumpTarget.getReg(), RegState::Kill); + } else if (RetOpcode == ARM::TCRETURNriND) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPrND)). + addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = prior(MBBI); + for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + } + + if (VARegSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize); +} + +/// getFrameIndexReference - Provide a base+offset reference to an FI slot for +/// debug info. It's the same as what we use for resolving the code-gen +/// references for now. FIXME: This can go wrong when references are +/// SP-relative and simple call frames aren't used. +int +ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + return ResolveFrameIndexReference(MF, FI, FrameReg, 0); +} + +int +ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg, + int SPAdj) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMBaseRegisterInfo *RegInfo = + static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo()); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); + int FPOffset = Offset - AFI->getFramePtrSpillOffset(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + FrameReg = ARM::SP; + Offset += SPAdj; + if (AFI->isGPRCalleeSavedArea1Frame(FI)) + return Offset - AFI->getGPRCalleeSavedArea1Offset(); + else if (AFI->isGPRCalleeSavedArea2Frame(FI)) + return Offset - AFI->getGPRCalleeSavedArea2Offset(); + else if (AFI->isDPRCalleeSavedAreaFrame(FI)) + return Offset - AFI->getDPRCalleeSavedAreaOffset(); + + // When dynamically realigning the stack, use the frame pointer for + // parameters, and the stack/base pointer for locals. + if (RegInfo->needsStackRealignment(MF)) { + assert (hasFP(MF) && "dynamic stack realignment without a FP!"); + if (isFixed) { + FrameReg = RegInfo->getFrameRegister(MF); + Offset = FPOffset; + } else if (MFI->hasVarSizedObjects()) { + assert(RegInfo->hasBasePointer(MF) && + "VLAs and dynamic stack alignment, but missing base pointer!"); + FrameReg = RegInfo->getBaseRegister(); + } + return Offset; + } + + // If there is a frame pointer, use it when we can. + if (hasFP(MF) && AFI->hasStackFrame()) { + // Use frame pointer to reference fixed objects. Use it for locals if + // there are VLAs (and thus the SP isn't reliable as a base). + if (isFixed || (MFI->hasVarSizedObjects() && + !RegInfo->hasBasePointer(MF))) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } else if (MFI->hasVarSizedObjects()) { + assert(RegInfo->hasBasePointer(MF) && "missing base pointer!"); + // Try to use the frame pointer if we can, else use the base pointer + // since it's available. This is handy for the emergency spill slot, in + // particular. + if (AFI->isThumb2Function()) { + if (FPOffset >= -255 && FPOffset < 0) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + } else + FrameReg = RegInfo->getBaseRegister(); + } else if (AFI->isThumb2Function()) { + // In Thumb2 mode, the negative offset is very limited. Try to avoid + // out of range references. + if (FPOffset >= -255 && FPOffset < 0) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) { + // Otherwise, use SP or FP, whichever is closer to the stack slot. + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + } + // Use the base pointer if we have one. + if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + return Offset; +} + +int ARMFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + unsigned FrameReg; + return getFrameIndexReference(MF, FI, FrameReg); +} + +void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + unsigned StmOpc, unsigned StrOpc, + bool NoGap, + bool(*Func)(unsigned, bool)) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + DebugLoc DL; + if (MI != MBB.end()) DL = MI->getDebugLoc(); + + SmallVector<std::pair<unsigned,bool>, 4> Regs; + unsigned i = CSI.size(); + while (i != 0) { + unsigned LastReg = 0; + for (; i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (!(Func)(Reg, STI.isTargetDarwin())) continue; + + // Add the callee-saved register as live-in unless it's LR and + // @llvm.returnaddress is called. If LR is returned for + // @llvm.returnaddress then it's already added to the function and + // entry block live-in sets. + bool isKill = true; + if (Reg == ARM::LR) { + if (MF.getFrameInfo()->isReturnAddressTaken() && + MF.getRegInfo().isLiveIn(Reg)) + isKill = false; + } + + if (isKill) + MBB.addLiveIn(Reg); + + // If NoGap is true, push consecutive registers and then leave the rest + // for other instructions. e.g. + // vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11} + if (NoGap && LastReg && LastReg != Reg-1) + break; + LastReg = Reg; + Regs.push_back(std::make_pair(Reg, isKill)); + } + + if (Regs.empty()) + continue; + if (Regs.size() > 1 || StrOpc== 0) { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP) + .addReg(ARM::SP)); + for (unsigned i = 0, e = Regs.size(); i < e; ++i) + MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second)); + } else if (Regs.size() == 1) { + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc), + ARM::SP) + .addReg(Regs[0].first, getKillRegState(Regs[0].second)) + .addReg(ARM::SP); + // ARM mode needs an extra reg0 here due to addrmode2. Will go away once + // that refactoring is complete (eventually). + if (StrOpc == ARM::STR_PRE) { + MIB.addReg(0); + MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::sub, 4, ARM_AM::no_shift)); + } else + MIB.addImm(-4); + AddDefaultPred(MIB); + } + Regs.clear(); + } +} + +void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + unsigned LdmOpc, unsigned LdrOpc, + bool isVarArg, bool NoGap, + bool(*Func)(unsigned, bool)) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + DebugLoc DL = MI->getDebugLoc(); + unsigned RetOpcode = MI->getOpcode(); + bool isTailCall = (RetOpcode == ARM::TCRETURNdi || + RetOpcode == ARM::TCRETURNdiND || + RetOpcode == ARM::TCRETURNri || + RetOpcode == ARM::TCRETURNriND); + + SmallVector<unsigned, 4> Regs; + unsigned i = CSI.size(); + while (i != 0) { + unsigned LastReg = 0; + bool DeleteRet = false; + for (; i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (!(Func)(Reg, STI.isTargetDarwin())) continue; + + if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps()) { + Reg = ARM::PC; + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; + // Fold the return instruction into the LDM. + DeleteRet = true; + } + + // If NoGap is true, pop consecutive registers and then leave the rest + // for other instructions. e.g. + // vpop {d8, d10, d11} -> vpop {d8}, vpop {d10, d11} + if (NoGap && LastReg && LastReg != Reg-1) + break; + + LastReg = Reg; + Regs.push_back(Reg); + } + + if (Regs.empty()) + continue; + if (Regs.size() > 1 || LdrOpc == 0) { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP) + .addReg(ARM::SP)); + for (unsigned i = 0, e = Regs.size(); i < e; ++i) + MIB.addReg(Regs[i], getDefRegState(true)); + if (DeleteRet) + MI->eraseFromParent(); + MI = MIB; + } else if (Regs.size() == 1) { + // If we adjusted the reg to PC from LR above, switch it back here. We + // only do that for LDM. + if (Regs[0] == ARM::PC) + Regs[0] = ARM::LR; + MachineInstrBuilder MIB = + BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0]) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP); + // ARM mode needs an extra reg0 here due to addrmode2. Will go away once + // that refactoring is complete (eventually). + if (LdrOpc == ARM::LDR_POST) { + MIB.addReg(0); + MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift)); + } else + MIB.addImm(4); + AddDefaultPred(MIB); + } + Regs.clear(); + } +} + +bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD; + unsigned PushOneOpc = AFI->isThumbFunction() ? ARM::t2STR_PRE : ARM::STR_PRE; + unsigned FltOpc = ARM::VSTMDDB_UPD; + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register); + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register); + emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register); + + return true; +} + +bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; + + unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD; + unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST : ARM::LDR_POST; + unsigned FltOpc = ARM::VLDMDIA_UPD; + emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register); + emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, + &isARMArea2Register); + emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, + &isARMArea1Register); + + return true; +} + +// FIXME: Make generic? +static unsigned GetFunctionSizeInBytes(const MachineFunction &MF, + const ARMBaseInstrInfo &TII) { + unsigned FnSize = 0; + for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + const MachineBasicBlock &MBB = *MBBI; + for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end(); + I != E; ++I) + FnSize += TII.GetInstSizeInBytes(I); + } + return FnSize; +} + +/// estimateStackSize - Estimate and return the size of the frame. +/// FIXME: Make generic? +static unsigned estimateStackSize(MachineFunction &MF) { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + int Offset = 0; + for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -FFI->getObjectOffset(i); + if (FixedOff > Offset) Offset = FixedOff; + } + for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { + if (FFI->isDeadObjectIndex(i)) + continue; + Offset += FFI->getObjectSize(i); + unsigned Align = FFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset+Align-1)/Align*Align; + } + return (unsigned)Offset; +} + +/// estimateRSStackSizeLimit - Look at each instruction that references stack +/// frames and return the stack size limit beyond which some of these +/// instructions will require a scratch register during their expansion later. +// FIXME: Move to TII? +static unsigned estimateRSStackSizeLimit(MachineFunction &MF, + const TargetFrameLowering *TFI) { + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned Limit = (1 << 12) - 1; + for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) { + for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); + I != E; ++I) { + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + if (!I->getOperand(i).isFI()) continue; + + // When using ADDri to get the address of a stack object, 255 is the + // largest offset guaranteed to fit in the immediate offset. + if (I->getOpcode() == ARM::ADDri) { + Limit = std::min(Limit, (1U << 8) - 1); + break; + } + + // Otherwise check the addressing mode. + switch (I->getDesc().TSFlags & ARMII::AddrModeMask) { + case ARMII::AddrMode3: + case ARMII::AddrModeT2_i8: + Limit = std::min(Limit, (1U << 8) - 1); + break; + case ARMII::AddrMode5: + case ARMII::AddrModeT2_i8s4: + Limit = std::min(Limit, ((1U << 8) - 1) * 4); + break; + case ARMII::AddrModeT2_i12: + // i12 supports only positive offset so these will be converted to + // i8 opcodes. See llvm::rewriteT2FrameIndex. + if (TFI->hasFP(MF) && AFI->hasStackFrame()) + Limit = std::min(Limit, (1U << 8) - 1); + break; + case ARMII::AddrMode4: + case ARMII::AddrMode6: + // Addressing modes 4 & 6 (load/store) instructions can't encode an + // immediate offset for stack references. + return 0; + default: + break; + } + break; // At most one FI per instruction + } + } + } + + return Limit; +} + +void +ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const { + // This tells PEI to spill the FP as if it is any other callee-save register + // to take advantage the eliminateFrameIndex machinery. This also ensures it + // is spilled in the order specified by getCalleeSavedRegs() to make it easier + // to combine multiple loads / stores. + bool CanEliminateFrame = true; + bool CS1Spilled = false; + bool LRSpilled = false; + unsigned NumGPRSpills = 0; + SmallVector<unsigned, 4> UnspilledCS1GPRs; + SmallVector<unsigned, 4> UnspilledCS2GPRs; + const ARMBaseRegisterInfo *RegInfo = + static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo()); + const ARMBaseInstrInfo &TII = + *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo()); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + // Spill R4 if Thumb2 function requires stack realignment - it will be used as + // scratch register. Also spill R4 if Thumb2 function has varsized objects, + // since it's not always possible to restore sp from fp in a single + // instruction. + // FIXME: It will be better just to find spare register here. + if (AFI->isThumb2Function() && + (MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF))) + MF.getRegInfo().setPhysRegUsed(ARM::R4); + + if (AFI->isThumb1OnlyFunction()) { + // Spill LR if Thumb1 function uses variable length argument lists. + if (AFI->getVarArgsRegSaveSize() > 0) + MF.getRegInfo().setPhysRegUsed(ARM::LR); + + // Spill R4 if Thumb1 epilogue has to restore SP from FP since + // FIXME: It will be better just to find spare register here. + if (MFI->hasVarSizedObjects()) + MF.getRegInfo().setPhysRegUsed(ARM::R4); + } + + // Spill the BasePtr if it's used. + if (RegInfo->hasBasePointer(MF)) + MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); + + // Don't spill FP if the frame can be eliminated. This is determined + // by scanning the callee-save registers to see if any is used. + const unsigned *CSRegs = RegInfo->getCalleeSavedRegs(); + for (unsigned i = 0; CSRegs[i]; ++i) { + unsigned Reg = CSRegs[i]; + bool Spilled = false; + if (MF.getRegInfo().isPhysRegUsed(Reg)) { + Spilled = true; + CanEliminateFrame = false; + } else { + // Check alias registers too. + for (const unsigned *Aliases = + RegInfo->getAliasSet(Reg); *Aliases; ++Aliases) { + if (MF.getRegInfo().isPhysRegUsed(*Aliases)) { + Spilled = true; + CanEliminateFrame = false; + } + } + } + + if (!ARM::GPRRegisterClass->contains(Reg)) + continue; + + if (Spilled) { + NumGPRSpills++; + + if (!STI.isTargetDarwin()) { + if (Reg == ARM::LR) + LRSpilled = true; + CS1Spilled = true; + continue; + } + + // Keep track if LR and any of R4, R5, R6, and R7 is spilled. + switch (Reg) { + case ARM::LR: + LRSpilled = true; + // Fallthrough + case ARM::R4: case ARM::R5: + case ARM::R6: case ARM::R7: + CS1Spilled = true; + break; + default: + break; + } + } else { + if (!STI.isTargetDarwin()) { + UnspilledCS1GPRs.push_back(Reg); + continue; + } + + switch (Reg) { + case ARM::R4: case ARM::R5: + case ARM::R6: case ARM::R7: + case ARM::LR: + UnspilledCS1GPRs.push_back(Reg); + break; + default: + UnspilledCS2GPRs.push_back(Reg); + break; + } + } + } + + bool ForceLRSpill = false; + if (!LRSpilled && AFI->isThumb1OnlyFunction()) { + unsigned FnSize = GetFunctionSizeInBytes(MF, TII); + // Force LR to be spilled if the Thumb function size is > 2048. This enables + // use of BL to implement far jump. If it turns out that it's not needed + // then the branch fix up path will undo it. + if (FnSize >= (1 << 11)) { + CanEliminateFrame = false; + ForceLRSpill = true; + } + } + + // If any of the stack slot references may be out of range of an immediate + // offset, make sure a register (or a spill slot) is available for the + // register scavenger. Note that if we're indexing off the frame pointer, the + // effective stack size is 4 bytes larger since the FP points to the stack + // slot of the previous FP. Also, if we have variable sized objects in the + // function, stack slot references will often be negative, and some of + // our instructions are positive-offset only, so conservatively consider + // that case to want a spill slot (or register) as well. Similarly, if + // the function adjusts the stack pointer during execution and the + // adjustments aren't already part of our stack size estimate, our offset + // calculations may be off, so be conservative. + // FIXME: We could add logic to be more precise about negative offsets + // and which instructions will need a scratch register for them. Is it + // worth the effort and added fragility? + bool BigStack = + (RS && + (estimateStackSize(MF) + ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >= + estimateRSStackSizeLimit(MF, this))) + || MFI->hasVarSizedObjects() + || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); + + bool ExtraCSSpill = false; + if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { + AFI->setHasStackFrame(true); + + // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled. + // Spill LR as well so we can fold BX_RET to the registers restore (LDM). + if (!LRSpilled && CS1Spilled) { + MF.getRegInfo().setPhysRegUsed(ARM::LR); + NumGPRSpills++; + UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(), + UnspilledCS1GPRs.end(), (unsigned)ARM::LR)); + ForceLRSpill = false; + ExtraCSSpill = true; + } + + if (hasFP(MF)) { + MF.getRegInfo().setPhysRegUsed(FramePtr); + NumGPRSpills++; + } + + // If stack and double are 8-byte aligned and we are spilling an odd number + // of GPRs, spill one extra callee save GPR so we won't have to pad between + // the integer and double callee save areas. + unsigned TargetAlign = getStackAlignment(); + if (TargetAlign == 8 && (NumGPRSpills & 1)) { + if (CS1Spilled && !UnspilledCS1GPRs.empty()) { + for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { + unsigned Reg = UnspilledCS1GPRs[i]; + // Don't spill high register if the function is thumb1 + if (!AFI->isThumb1OnlyFunction() || + isARMLowRegister(Reg) || Reg == ARM::LR) { + MF.getRegInfo().setPhysRegUsed(Reg); + if (!RegInfo->isReservedReg(MF, Reg)) + ExtraCSSpill = true; + break; + } + } + } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) { + unsigned Reg = UnspilledCS2GPRs.front(); + MF.getRegInfo().setPhysRegUsed(Reg); + if (!RegInfo->isReservedReg(MF, Reg)) + ExtraCSSpill = true; + } + } + + // Estimate if we might need to scavenge a register at some point in order + // to materialize a stack offset. If so, either spill one additional + // callee-saved register or reserve a special spill slot to facilitate + // register scavenging. Thumb1 needs a spill slot for stack pointer + // adjustments also, even when the frame itself is small. + if (BigStack && !ExtraCSSpill) { + // If any non-reserved CS register isn't spilled, just spill one or two + // extra. That should take care of it! + unsigned NumExtras = TargetAlign / 4; + SmallVector<unsigned, 2> Extras; + while (NumExtras && !UnspilledCS1GPRs.empty()) { + unsigned Reg = UnspilledCS1GPRs.back(); + UnspilledCS1GPRs.pop_back(); + if (!RegInfo->isReservedReg(MF, Reg) && + (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) || + Reg == ARM::LR)) { + Extras.push_back(Reg); + NumExtras--; + } + } + // For non-Thumb1 functions, also check for hi-reg CS registers + if (!AFI->isThumb1OnlyFunction()) { + while (NumExtras && !UnspilledCS2GPRs.empty()) { + unsigned Reg = UnspilledCS2GPRs.back(); + UnspilledCS2GPRs.pop_back(); + if (!RegInfo->isReservedReg(MF, Reg)) { + Extras.push_back(Reg); + NumExtras--; + } + } + } + if (Extras.size() && NumExtras == 0) { + for (unsigned i = 0, e = Extras.size(); i != e; ++i) { + MF.getRegInfo().setPhysRegUsed(Extras[i]); + } + } else if (!AFI->isThumb1OnlyFunction()) { + // note: Thumb1 functions spill to R12, not the stack. Reserve a slot + // closest to SP or frame pointer. + const TargetRegisterClass *RC = ARM::GPRRegisterClass; + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + } + } + } + + if (ForceLRSpill) { + MF.getRegInfo().setPhysRegUsed(ARM::LR); + AFI->setLRIsSpilledForFarJump(true); + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h new file mode 100644 index 0000000..1288b70 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -0,0 +1,74 @@ +//==-- ARMTargetFrameLowering.h - Define frame lowering for ARM --*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef ARM_FRAMEINFO_H +#define ARM_FRAMEINFO_H + +#include "ARM.h" +#include "ARMSubtarget.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + class ARMSubtarget; + +class ARMFrameLowering : public TargetFrameLowering { +protected: + const ARMSubtarget &STI; + +public: + explicit ARMFrameLowering(const ARMSubtarget &sti) + : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), + STI(sti) { + } + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool hasFP(const MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const; + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const; + int ResolveFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg, int SPAdj) const; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const; + + private: + void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, + unsigned StrOpc, bool NoGap, + bool(*Func)(unsigned, bool)) const; + void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc, + unsigned LdrOpc, bool isVarArg, bool NoGap, + bool(*Func)(unsigned, bool)) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp b/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp index 85b0c6c..3f02383 100644 --- a/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp @@ -12,7 +12,8 @@ // global). Such a transformation can significantly reduce the register pressure // when many globals are involved. // -// For example, consider the code which touches several global variables at once: +// For example, consider the code which touches several global variables at +// once: // // static int foo[N], bar[N], baz[N]; // @@ -48,7 +49,7 @@ // str r0, [r5], #4 // // note that we saved 2 registers here almostly "for free". -// ===----------------------------------------------------------------------===// +// ===---------------------------------------------------------------------===// #define DEBUG_TYPE "arm-global-merge" #include "ARM.h" @@ -64,16 +65,17 @@ #include "llvm/Pass.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; namespace { - class LLVM_LIBRARY_VISIBILITY ARMGlobalMerge : public FunctionPass { + class ARMGlobalMerge : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining /// target type sizes. const TargetLowering *TLI; bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool) const; + Module &M, bool isConst) const; public: static char ID; // Pass identification, replacement for typeid. @@ -81,7 +83,7 @@ namespace { : FunctionPass(ID), TLI(tli) {} virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function& F); + virtual bool runOnFunction(Function &F); const char *getPassName() const { return "Merge internal globals"; @@ -95,13 +97,11 @@ namespace { struct GlobalCmp { const TargetData *TD; - GlobalCmp(const TargetData *td): - TD(td) { } + GlobalCmp(const TargetData *td) : TD(td) { } - bool operator() (const GlobalVariable* GV1, - const GlobalVariable* GV2) { - const Type* Ty1 = cast<PointerType>(GV1->getType())->getElementType(); - const Type* Ty2 = cast<PointerType>(GV2->getType())->getElementType(); + bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { + const Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); + const Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2)); } @@ -130,27 +130,27 @@ bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, uint64_t MergedSize = 0; std::vector<const Type*> Tys; std::vector<Constant*> Inits; - for (j = i; MergedSize < MaxOffset && j != e; ++j) { - const Type* Ty = Globals[j]->getType()->getElementType(); + for (j = i; j != e; ++j) { + const Type *Ty = Globals[j]->getType()->getElementType(); + MergedSize += TD->getTypeAllocSize(Ty); + if (MergedSize > MaxOffset) { + break; + } Tys.push_back(Ty); Inits.push_back(Globals[j]->getInitializer()); - MergedSize += TD->getTypeAllocSize(Ty); } - StructType* MergedTy = StructType::get(M.getContext(), Tys); - Constant* MergedInit = ConstantStruct::get(MergedTy, Inits); - GlobalVariable* MergedGV = new GlobalVariable(M, MergedTy, isConst, + StructType *MergedTy = StructType::get(M.getContext(), Tys); + Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); + GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, GlobalValue::InternalLinkage, - MergedInit, "merged"); + MergedInit, "_MergedGlobals"); for (size_t k = i; k < j; ++k) { - SmallVector<Constant*, 2> Idx; - Idx.push_back(ConstantInt::get(Int32Ty, 0)); - Idx.push_back(ConstantInt::get(Int32Ty, k-i)); - - Constant* GEP = - ConstantExpr::getInBoundsGetElementPtr(MergedGV, - &Idx[0], Idx.size()); - + Constant *Idx[2] = { + ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, k-i) + }; + Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx, 2); Globals[k]->replaceAllUsesWith(GEP); Globals[k]->eraseFromParent(); } @@ -161,8 +161,8 @@ bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, } -bool ARMGlobalMerge::doInitialization(Module& M) { - SmallVector<GlobalVariable*, 16> Globals, ConstGlobals; +bool ARMGlobalMerge::doInitialization(Module &M) { + SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals; const TargetData *TD = TLI->getTargetData(); unsigned MaxOffset = TLI->getMaximalGlobalOffset(); bool Changed = false; @@ -183,8 +183,11 @@ bool ARMGlobalMerge::doInitialization(Module& M) { I->getName().startswith(".llvm.")) continue; - if (TD->getTypeAllocSize(I->getType()) < MaxOffset) { - if (I->isConstant()) + if (TD->getTypeAllocSize(I->getType()->getElementType()) < MaxOffset) { + const TargetLoweringObjectFile &TLOF = TLI->getObjFileLowering(); + if (TLOF.getKindForGlobal(I, TLI->getTargetMachine()).isBSSLocal()) + BSSGlobals.push_back(I); + else if (I->isConstant()) ConstGlobals.push_back(I); else Globals.push_back(I); @@ -193,17 +196,19 @@ bool ARMGlobalMerge::doInitialization(Module& M) { if (Globals.size() > 1) Changed |= doMerge(Globals, M, false); + if (BSSGlobals.size() > 1) + Changed |= doMerge(BSSGlobals, M, false); + // FIXME: This currently breaks the EH processing due to way how the // typeinfo detection works. We might want to detect the TIs and ignore // them in the future. - // if (ConstGlobals.size() > 1) // Changed |= doMerge(ConstGlobals, M, true); return Changed; } -bool ARMGlobalMerge::runOnFunction(Function& F) { +bool ARMGlobalMerge::runOnFunction(Function &F) { return false; } diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp new file mode 100644 index 0000000..676b01e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -0,0 +1,121 @@ +//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARMHazardRecognizer.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Target/TargetRegisterInfo.h" +using namespace llvm; + +static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI, + const TargetRegisterInfo &TRI) { + // FIXME: Detect integer instructions properly. + const TargetInstrDesc &TID = MI->getDesc(); + unsigned Domain = TID.TSFlags & ARMII::DomainMask; + if (Domain == ARMII::DomainVFP) { + unsigned Opcode = MI->getOpcode(); + if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD || + Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) + return false; + } else if (Domain == ARMII::DomainNEON) { + if (MI->getDesc().mayStore() || MI->getDesc().mayLoad()) + return false; + } else + return false; + return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI); +} + +ScheduleHazardRecognizer::HazardType +ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { + assert(Stalls == 0 && "ARM hazards don't support scoreboard lookahead"); + + MachineInstr *MI = SU->getInstr(); + + if (!MI->isDebugValue()) { + if (ITBlockSize && MI != ITBlockMIs[ITBlockSize-1]) + return Hazard; + + // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following + // a VMLA / VMLS will cause 4 cycle stall. + const TargetInstrDesc &TID = MI->getDesc(); + if (LastMI && (TID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) { + MachineInstr *DefMI = LastMI; + const TargetInstrDesc &LastTID = LastMI->getDesc(); + // Skip over one non-VFP / NEON instruction. + if (!LastTID.isBarrier() && + (LastTID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { + MachineBasicBlock::iterator I = LastMI; + if (I != LastMI->getParent()->begin()) { + I = llvm::prior(I); + DefMI = &*I; + } + } + + if (TII.isFpMLxInstruction(DefMI->getOpcode()) && + (TII.canCauseFpMLxStall(MI->getOpcode()) || + hasRAWHazard(DefMI, MI, TRI))) { + // Try to schedule another instruction for the next 4 cycles. + if (FpMLxStalls == 0) + FpMLxStalls = 4; + return Hazard; + } + } + } + + return ScoreboardHazardRecognizer::getHazardType(SU, Stalls); +} + +void ARMHazardRecognizer::Reset() { + LastMI = 0; + FpMLxStalls = 0; + ITBlockSize = 0; + ScoreboardHazardRecognizer::Reset(); +} + +void ARMHazardRecognizer::EmitInstruction(SUnit *SU) { + MachineInstr *MI = SU->getInstr(); + unsigned Opcode = MI->getOpcode(); + if (ITBlockSize) { + --ITBlockSize; + } else if (Opcode == ARM::t2IT) { + unsigned Mask = MI->getOperand(1).getImm(); + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + ITBlockSize = 4 - NumTZ; + MachineBasicBlock::iterator I = MI; + for (unsigned i = 0; i < ITBlockSize; ++i) { + // Advance to the next instruction, skipping any dbg_value instructions. + do { + ++I; + } while (I->isDebugValue()); + ITBlockMIs[ITBlockSize-1-i] = &*I; + } + } + + if (!MI->isDebugValue()) { + LastMI = MI; + FpMLxStalls = 0; + } + + ScoreboardHazardRecognizer::EmitInstruction(SU); +} + +void ARMHazardRecognizer::AdvanceCycle() { + if (FpMLxStalls && --FpMLxStalls == 0) + // Stalled for 4 cycles but still can't schedule any other instructions. + LastMI = 0; + ScoreboardHazardRecognizer::AdvanceCycle(); +} + +void ARMHazardRecognizer::RecedeCycle() { + llvm_unreachable("reverse ARM hazard checking unsupported"); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h new file mode 100644 index 0000000..2bc218d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h @@ -0,0 +1,54 @@ +//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling ARM functions. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMHAZARDRECOGNIZER_H +#define ARMHAZARDRECOGNIZER_H + +#include "llvm/CodeGen/ScoreboardHazardRecognizer.h" + +namespace llvm { + +class ARMBaseInstrInfo; +class ARMBaseRegisterInfo; +class ARMSubtarget; +class MachineInstr; + +class ARMHazardRecognizer : public ScoreboardHazardRecognizer { + const ARMBaseInstrInfo &TII; + const ARMBaseRegisterInfo &TRI; + const ARMSubtarget &STI; + + MachineInstr *LastMI; + unsigned FpMLxStalls; + unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled. + MachineInstr *ITBlockMIs[4]; + +public: + ARMHazardRecognizer(const InstrItineraryData *ItinData, + const ARMBaseInstrInfo &tii, + const ARMBaseRegisterInfo &tri, + const ARMSubtarget &sti, + const ScheduleDAG *DAG) : + ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), TII(tii), + TRI(tri), STI(sti), LastMI(0), ITBlockSize(0) {} + + virtual HazardType getHazardType(SUnit *SU, int Stalls); + virtual void Reset(); + virtual void EmitInstruction(SUnit *SU); + virtual void AdvanceCycle(); + virtual void RecedeCycle(); +}; + +} // end namespace llvm + +#endif // ARMHAZARDRECOGNIZER_H diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 51a30c1..a506cff 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -13,6 +13,7 @@ #define DEBUG_TYPE "arm-isel" #include "ARM.h" +#include "ARMBaseInstrInfo.h" #include "ARMAddressingModes.h" #include "ARMTargetMachine.h" #include "llvm/CallingConv.h" @@ -41,13 +42,25 @@ DisableShifterOp("disable-shifter-op", cl::Hidden, cl::desc("Disable isel of shifter-op"), cl::init(false)); +static cl::opt<bool> +CheckVMLxHazard("check-vmlx-hazard", cl::Hidden, + cl::desc("Check fp vmla / vmls hazard at isel time"), + cl::init(false)); + //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine /// instructions for SelectionDAG operations. /// namespace { + +enum AddrMode2Type { + AM2_BASE, // Simple AM2 (+-imm12) + AM2_SHOP // Shifter-op AM2 +}; + class ARMDAGToDAGISel : public SelectionDAGISel { ARMBaseTargetMachine &TM; + const ARMBaseInstrInfo *TII; /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can /// make the right decision when generating code for different targets. @@ -57,7 +70,8 @@ public: explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), TM(tm), - Subtarget(&TM.getSubtarget<ARMSubtarget>()) { + TII(static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo())), + Subtarget(&TM.getSubtarget<ARMSubtarget>()) { } virtual const char *getPassName() const { @@ -72,60 +86,101 @@ public: SDNode *Select(SDNode *N); - bool SelectShifterOperandReg(SDNode *Op, SDValue N, SDValue &A, + + bool hasNoVMLxHazardUse(SDNode *N) const; + bool isShifterOpProfitable(const SDValue &Shift, + ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt); + bool SelectShifterOperandReg(SDValue N, SDValue &A, SDValue &B, SDValue &C); - bool SelectAddrMode2(SDNode *Op, SDValue N, SDValue &Base, - SDValue &Offset, SDValue &Opc); + bool SelectShiftShifterOperandReg(SDValue N, SDValue &A, + SDValue &B, SDValue &C); + bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); + bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); + + AddrMode2Type SelectAddrMode2Worker(SDValue N, SDValue &Base, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode2Base(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_BASE; + } + + bool SelectAddrMode2ShOp(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_SHOP; + } + + bool SelectAddrMode2(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + SelectAddrMode2Worker(N, Base, Offset, Opc); +// return SelectAddrMode2ShOp(N, Base, Offset, Opc); + // This always matches one way or another. + return true; + } + bool SelectAddrMode2Offset(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc); - bool SelectAddrMode3(SDNode *Op, SDValue N, SDValue &Base, + bool SelectAddrMode3(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); bool SelectAddrMode3Offset(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc); - bool SelectAddrMode4(SDNode *Op, SDValue N, SDValue &Addr, - SDValue &Mode); - bool SelectAddrMode5(SDNode *Op, SDValue N, SDValue &Base, + bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset); - bool SelectAddrMode6(SDNode *Op, SDValue N, SDValue &Addr, SDValue &Align); - - bool SelectAddrModePC(SDNode *Op, SDValue N, SDValue &Offset, - SDValue &Label); - - bool SelectThumbAddrModeRR(SDNode *Op, SDValue N, SDValue &Base, - SDValue &Offset); - bool SelectThumbAddrModeRI5(SDNode *Op, SDValue N, unsigned Scale, - SDValue &Base, SDValue &OffImm, - SDValue &Offset); - bool SelectThumbAddrModeS1(SDNode *Op, SDValue N, SDValue &Base, - SDValue &OffImm, SDValue &Offset); - bool SelectThumbAddrModeS2(SDNode *Op, SDValue N, SDValue &Base, - SDValue &OffImm, SDValue &Offset); - bool SelectThumbAddrModeS4(SDNode *Op, SDValue N, SDValue &Base, - SDValue &OffImm, SDValue &Offset); - bool SelectThumbAddrModeSP(SDNode *Op, SDValue N, SDValue &Base, - SDValue &OffImm); - - bool SelectT2ShifterOperandReg(SDNode *Op, SDValue N, + bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align); + + bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label); + + // Thumb Addressing Modes: + bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset); + bool SelectThumbAddrModeRI(SDValue N, SDValue &Base, SDValue &Offset, + unsigned Scale); + bool SelectThumbAddrModeRI5S1(SDValue N, SDValue &Base, SDValue &Offset); + bool SelectThumbAddrModeRI5S2(SDValue N, SDValue &Base, SDValue &Offset); + bool SelectThumbAddrModeRI5S4(SDValue N, SDValue &Base, SDValue &Offset); + bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm); + + // Thumb 2 Addressing Modes: + bool SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg, SDValue &Opc); - bool SelectT2AddrModeImm12(SDNode *Op, SDValue N, SDValue &Base, - SDValue &OffImm); - bool SelectT2AddrModeImm8(SDNode *Op, SDValue N, SDValue &Base, + bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); + bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, SDValue &OffImm); - bool SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, SDValue &Base, - SDValue &OffImm); - bool SelectT2AddrModeSoReg(SDNode *Op, SDValue N, SDValue &Base, + bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm); + inline bool is_so_imm(unsigned Imm) const { + return ARM_AM::getSOImmVal(Imm) != -1; + } + + inline bool is_so_imm_not(unsigned Imm) const { + return ARM_AM::getSOImmVal(~Imm) != -1; + } + + inline bool is_t2_so_imm(unsigned Imm) const { + return ARM_AM::getT2SOImmVal(Imm) != -1; + } + + inline bool is_t2_so_imm_not(unsigned Imm) const { + return ARM_AM::getT2SOImmVal(~Imm) != -1; + } + inline bool Pred_so_imm(SDNode *inN) const { ConstantSDNode *N = cast<ConstantSDNode>(inN); - return ARM_AM::getSOImmVal(N->getZExtValue()) != -1; + return is_so_imm(N->getZExtValue()); } inline bool Pred_t2_so_imm(SDNode *inN) const { ConstantSDNode *N = cast<ConstantSDNode>(inN); - return ARM_AM::getT2SOImmVal(N->getZExtValue()) != -1; + return is_t2_so_imm(N->getZExtValue()); } // Include the pieces autogenerated from the target description. @@ -141,22 +196,30 @@ private: /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// loads of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. - SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, + SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); /// SelectVST - Select NEON store intrinsics. NumVecs should /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// stores of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. - SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, + SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should /// be 2, 3 or 4. The opcode arrays specify the instructions used for - /// load/store of D registers and even subregs and odd subregs of Q registers. - SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs, - unsigned *DOpcodes, unsigned *QOpcodes0, - unsigned *QOpcodes1); + /// load/store of D registers and Q registers. + SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, + bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes); + + /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs + /// should be 2, 3 or 4. The opcode array specifies the instructions used + /// for loading D registers. (Q registers are not supported.) + SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *Opcodes); /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be @@ -174,10 +237,10 @@ private: SDNode *SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag); - SDNode *SelectT2CMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + SDNode *SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag); - SDNode *SelectARMCMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + SDNode *SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag); @@ -199,9 +262,8 @@ private: SDNode *QuadDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); SDNode *QuadQRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); - // Form sequences of 8 consecutive D registers. - SDNode *OctoDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3, - SDValue V4, SDValue V5, SDValue V6, SDValue V7); + // Get the alignment operand for a NEON VLD or VST instruction. + SDValue GetVLDSTAlign(SDValue Align, unsigned NumVecs, bool is64BitVector); }; } @@ -229,9 +291,85 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { isInt32Immediate(N->getOperand(1).getNode(), Imm); } +/// \brief Check whether a particular node is a constant value representable as +/// (N * Scale) where (N in [\arg RangeMin, \arg RangeMax). +/// +/// \param ScaledConstant [out] - On success, the pre-scaled constant value. +static bool isScaledConstantInRange(SDValue Node, unsigned Scale, + int RangeMin, int RangeMax, + int &ScaledConstant) { + assert(Scale && "Invalid scale!"); + + // Check that this is a constant. + const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Node); + if (!C) + return false; -bool ARMDAGToDAGISel::SelectShifterOperandReg(SDNode *Op, - SDValue N, + ScaledConstant = (int) C->getZExtValue(); + if ((ScaledConstant % Scale) != 0) + return false; + + ScaledConstant /= Scale; + return ScaledConstant >= RangeMin && ScaledConstant < RangeMax; +} + +/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS +/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at +/// least on current ARM implementations) which should be avoidded. +bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { + if (OptLevel == CodeGenOpt::None) + return true; + + if (!CheckVMLxHazard) + return true; + + if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9()) + return true; + + if (!N->hasOneUse()) + return false; + + SDNode *Use = *N->use_begin(); + if (Use->getOpcode() == ISD::CopyToReg) + return true; + if (Use->isMachineOpcode()) { + const TargetInstrDesc &TID = TII->get(Use->getMachineOpcode()); + if (TID.mayStore()) + return true; + unsigned Opcode = TID.getOpcode(); + if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) + return true; + // vmlx feeding into another vmlx. We actually want to unfold + // the use later in the MLxExpansion pass. e.g. + // vmla + // vmla (stall 8 cycles) + // + // vmul (5 cycles) + // vadd (5 cycles) + // vmla + // This adds up to about 18 - 19 cycles. + // + // vmla + // vmul (stall 4 cycles) + // vadd adds up to about 14 cycles. + return TII->isFpMLxInstruction(Opcode); + } + + return false; +} + +bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, + ARM_AM::ShiftOpc ShOpcVal, + unsigned ShAmt) { + if (!Subtarget->isCortexA9()) + return true; + if (Shift.hasOneUse()) + return true; + // R << 2 is free. + return ShOpcVal == ARM_AM::lsl && ShAmt == 2; +} + +bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N, SDValue &BaseReg, SDValue &ShReg, SDValue &Opc) { @@ -251,16 +389,92 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDNode *Op, ShImmVal = RHS->getZExtValue() & 31; } else { ShReg = N.getOperand(1); + if (!isShifterOpProfitable(N, ShOpcVal, ShImmVal)) + return false; } Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), MVT::i32); return true; } -bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N, - SDValue &Base, SDValue &Offset, +bool ARMDAGToDAGISel::SelectShiftShifterOperandReg(SDValue N, + SDValue &BaseReg, + SDValue &ShReg, + SDValue &Opc) { + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N); + + // Don't match base register only case. That is matched to a separate + // lower complexity pattern with explicit register operand. + if (ShOpcVal == ARM_AM::no_shift) return false; + + BaseReg = N.getOperand(0); + unsigned ShImmVal = 0; + // Do not check isShifterOpProfitable. This must return true. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + ShReg = CurDAG->getRegister(0, MVT::i32); + ShImmVal = RHS->getZExtValue() & 31; + } else { + ShReg = N.getOperand(1); + } + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), + MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, + SDValue &Base, + SDValue &OffImm) { + // Match simple R + imm12 operands. + + // Base only. + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + !CurDAG->isBaseWithConstantOffset(N)) { + if (N.getOpcode() == ISD::FrameIndex) { + // Match frame index. + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + } else + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned) + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + + + +bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc) { - if (N.getOpcode() == ISD::MUL) { + if (N.getOpcode() == ISD::MUL && + (!Subtarget->isCortexA9() || N.hasOneUse())) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { // X * [3,5,9] -> X + X * [2,4,8] etc. int RHSC = (int)RHS->getZExtValue(); @@ -283,7 +497,114 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N, } } - if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) { + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + // ISD::OR that is equivalent to an ISD::ADD. + !CurDAG->isBaseWithConstantOffset(N)) + return false; + + // Leave simple R +/- imm12 operands for LDRi12 + if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1, + -0x1000+1, 0x1000, RHSC)) // 12 bits. + return false; + } + + if (Subtarget->isCortexA9() && !N.hasOneUse()) + // Compute R +/- (R << N) and reuse it. + return false; + + // Otherwise this is R +/- [possibly shifted] R. + ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add; + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1)); + unsigned ShAmt = 0; + + Base = N.getOperand(0); + Offset = N.getOperand(1); + + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt)) + Offset = N.getOperand(1).getOperand(0); + else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + // Try matching (R shl C) + (R). + if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift && + !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) { + ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0)); + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't + // fold it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (!Subtarget->isCortexA9() || + (N.hasOneUse() && + isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt))) { + Offset = N.getOperand(0).getOperand(0); + Base = N.getOperand(1); + } else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + MVT::i32); + return true; +} + + + + +//----- + +AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, + SDValue &Base, + SDValue &Offset, + SDValue &Opc) { + if (N.getOpcode() == ISD::MUL && + (!Subtarget->isCortexA9() || N.hasOneUse())) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + // X * [3,5,9] -> X + X * [2,4,8] etc. + int RHSC = (int)RHS->getZExtValue(); + if (RHSC & 1) { + RHSC = RHSC & ~1; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + if (isPowerOf2_32(RHSC)) { + unsigned ShAmt = Log2_32(RHSC); + Base = Offset = N.getOperand(0); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, + ARM_AM::lsl), + MVT::i32); + return AM2_SHOP; + } + } + } + } + + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + // ISD::OR that is equivalent to an ADD. + !CurDAG->isBaseWithConstantOffset(N)) { Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); @@ -297,36 +618,45 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N, Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0, ARM_AM::no_shift), MVT::i32); - return true; + return AM2_BASE; } // Match simple R +/- imm12 operands. - if (N.getOpcode() == ISD::ADD) - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - if ((RHSC >= 0 && RHSC < 0x1000) || - (RHSC < 0 && RHSC > -0x1000)) { // 12 bits. - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); - } - Offset = CurDAG->getRegister(0, MVT::i32); + if (N.getOpcode() != ISD::SUB) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1, + -0x1000+1, 0x1000, RHSC)) { // 12 bits. + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); - ARM_AM::AddrOpc AddSub = ARM_AM::add; - if (RHSC < 0) { - AddSub = ARM_AM::sub; - RHSC = - RHSC; - } - Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC, - ARM_AM::no_shift), - MVT::i32); - return true; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC, + ARM_AM::no_shift), + MVT::i32); + return AM2_BASE; } + } + + if (Subtarget->isCortexA9() && !N.hasOneUse()) { + // Compute R +/- (R << N) and reuse it. + Base = N; + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0, + ARM_AM::no_shift), + MVT::i32); + return AM2_BASE; + } // Otherwise this is R +/- [possibly shifted] R. - ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::ADD ? ARM_AM::add:ARM_AM::sub; + ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? ARM_AM::add:ARM_AM::sub; ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1)); unsigned ShAmt = 0; @@ -339,14 +669,20 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N, if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) { ShAmt = Sh->getZExtValue(); - Offset = N.getOperand(1).getOperand(0); + if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt)) + Offset = N.getOperand(1).getOperand(0); + else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } } else { ShOpcVal = ARM_AM::no_shift; } } // Try matching (R shl C) + (R). - if (N.getOpcode() == ISD::ADD && ShOpcVal == ARM_AM::no_shift) { + if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift && + !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) { ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0)); if (ShOpcVal != ARM_AM::no_shift) { // Check to see if the RHS of the shift is a constant, if not, we can't @@ -354,8 +690,15 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N, if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) { ShAmt = Sh->getZExtValue(); - Offset = N.getOperand(0).getOperand(0); - Base = N.getOperand(1); + if (!Subtarget->isCortexA9() || + (N.hasOneUse() && + isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt))) { + Offset = N.getOperand(0).getOperand(0); + Base = N.getOperand(1); + } else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } } else { ShOpcVal = ARM_AM::no_shift; } @@ -364,7 +707,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N, Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), MVT::i32); - return true; + return AM2_SHOP; } bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N, @@ -375,15 +718,13 @@ bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N, : cast<StoreSDNode>(Op)->getAddressingMode(); ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) ? ARM_AM::add : ARM_AM::sub; - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { - int Val = (int)C->getZExtValue(); - if (Val >= 0 && Val < 0x1000) { // 12 bits. - Offset = CurDAG->getRegister(0, MVT::i32); - Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val, - ARM_AM::no_shift), - MVT::i32); - return true; - } + int Val; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits. + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val, + ARM_AM::no_shift), + MVT::i32); + return true; } Offset = N; @@ -394,7 +735,12 @@ bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N, // it. if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) { ShAmt = Sh->getZExtValue(); - Offset = N.getOperand(0); + if (isShifterOpProfitable(N, ShOpcVal, ShAmt)) + Offset = N.getOperand(0); + else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } } else { ShOpcVal = ARM_AM::no_shift; } @@ -406,7 +752,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N, } -bool ARMDAGToDAGISel::SelectAddrMode3(SDNode *Op, SDValue N, +bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc) { if (N.getOpcode() == ISD::SUB) { @@ -417,7 +763,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDNode *Op, SDValue N, return true; } - if (N.getOpcode() != ISD::ADD) { + if (!CurDAG->isBaseWithConstantOffset(N)) { Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); @@ -429,25 +775,23 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDNode *Op, SDValue N, } // If the RHS is +/- imm8, fold into addr mode. - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - if ((RHSC >= 0 && RHSC < 256) || - (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed. - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); - } - Offset = CurDAG->getRegister(0, MVT::i32); + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1, + -256 + 1, 256, RHSC)) { // 8 bits. + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); - ARM_AM::AddrOpc AddSub = ARM_AM::add; - if (RHSC < 0) { - AddSub = ARM_AM::sub; - RHSC = - RHSC; - } - Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32); - return true; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = -RHSC; } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32); + return true; } Base = N.getOperand(0); @@ -464,13 +808,11 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N, : cast<StoreSDNode>(Op)->getAddressingMode(); ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) ? ARM_AM::add : ARM_AM::sub; - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { - int Val = (int)C->getZExtValue(); - if (Val >= 0 && Val < 256) { - Offset = CurDAG->getRegister(0, MVT::i32); - Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32); - return true; - } + int Val; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 256, Val)) { // 12 bits. + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32); + return true; } Offset = N; @@ -478,16 +820,9 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N, return true; } -bool ARMDAGToDAGISel::SelectAddrMode4(SDNode *Op, SDValue N, - SDValue &Addr, SDValue &Mode) { - Addr = N; - Mode = CurDAG->getTargetConstant(ARM_AM::getAM4ModeImm(ARM_AM::ia), MVT::i32); - return true; -} - -bool ARMDAGToDAGISel::SelectAddrMode5(SDNode *Op, SDValue N, +bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset) { - if (N.getOpcode() != ISD::ADD) { + if (!CurDAG->isBaseWithConstantOffset(N)) { Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); @@ -503,28 +838,23 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDNode *Op, SDValue N, } // If the RHS is +/- imm8, fold into addr mode. - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - if ((RHSC & 3) == 0) { // The constant is implicitly multiplied by 4. - RHSC >>= 2; - if ((RHSC >= 0 && RHSC < 256) || - (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed. - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); - } + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, + -256 + 1, 256, RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } - ARM_AM::AddrOpc AddSub = ARM_AM::add; - if (RHSC < 0) { - AddSub = ARM_AM::sub; - RHSC = - RHSC; - } - Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC), - MVT::i32); - return true; - } + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = -RHSC; } + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC), + MVT::i32); + return true; } Base = N; @@ -533,30 +863,50 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDNode *Op, SDValue N, return true; } -bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Op, SDValue N, - SDValue &Addr, SDValue &Align) { +bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr, + SDValue &Align) { Addr = N; - // Default to no alignment. - Align = CurDAG->getTargetConstant(0, MVT::i32); + + unsigned Alignment = 0; + if (LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(Parent)) { + // This case occurs only for VLD1-lane/dup and VST1-lane instructions. + // The maximum alignment is equal to the memory size being referenced. + unsigned LSNAlign = LSN->getAlignment(); + unsigned MemSize = LSN->getMemoryVT().getSizeInBits() / 8; + if (LSNAlign > MemSize && MemSize > 1) + Alignment = MemSize; + } else { + // All other uses of addrmode6 are for intrinsics. For now just record + // the raw alignment value; it will be refined later based on the legal + // alignment operands for the intrinsic. + Alignment = cast<MemIntrinsicSDNode>(Parent)->getAlignment(); + } + + Align = CurDAG->getTargetConstant(Alignment, MVT::i32); return true; } -bool ARMDAGToDAGISel::SelectAddrModePC(SDNode *Op, SDValue N, +bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label) { if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) { Offset = N.getOperand(0); SDValue N1 = N.getOperand(1); - Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(), - MVT::i32); + Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(), + MVT::i32); return true; } + return false; } -bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDNode *Op, SDValue N, + +//===----------------------------------------------------------------------===// +// Thumb Addressing Modes +//===----------------------------------------------------------------------===// + +bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset){ - // FIXME dl should come from the parent load or store, not the address - if (N.getOpcode() != ISD::ADD) { + if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) { ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N); if (!NC || !NC->isNullValue()) return false; @@ -571,82 +921,137 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDNode *Op, SDValue N, } bool -ARMDAGToDAGISel::SelectThumbAddrModeRI5(SDNode *Op, SDValue N, - unsigned Scale, SDValue &Base, - SDValue &OffImm, SDValue &Offset) { +ARMDAGToDAGISel::SelectThumbAddrModeRI(SDValue N, SDValue &Base, + SDValue &Offset, unsigned Scale) { if (Scale == 4) { SDValue TmpBase, TmpOffImm; - if (SelectThumbAddrModeSP(Op, N, TmpBase, TmpOffImm)) + if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm)) return false; // We want to select tLDRspi / tSTRspi instead. + if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() == ISD::TargetConstantPool) return false; // We want to select tLDRpci instead. } - if (N.getOpcode() != ISD::ADD) { + if (!CurDAG->isBaseWithConstantOffset(N)) + return false; + + // Thumb does not have [sp, r] address mode. + RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); + RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); + if ((LHSR && LHSR->getReg() == ARM::SP) || + (RHSR && RHSR->getReg() == ARM::SP)) + return false; + + // FIXME: Why do we explicitly check for a match here and then return false? + // Presumably to allow something else to match, but shouldn't this be + // documented? + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) + return false; + + Base = N.getOperand(0); + Offset = N.getOperand(1); + return true; +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeRI5S1(SDValue N, + SDValue &Base, + SDValue &Offset) { + return SelectThumbAddrModeRI(N, Base, Offset, 1); +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeRI5S2(SDValue N, + SDValue &Base, + SDValue &Offset) { + return SelectThumbAddrModeRI(N, Base, Offset, 2); +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeRI5S4(SDValue N, + SDValue &Base, + SDValue &Offset) { + return SelectThumbAddrModeRI(N, Base, Offset, 4); +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, + SDValue &Base, SDValue &OffImm) { + if (Scale == 4) { + SDValue TmpBase, TmpOffImm; + if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm)) + return false; // We want to select tLDRspi / tSTRspi instead. + + if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() == ISD::TargetConstantPool) + return false; // We want to select tLDRpci instead. + } + + if (!CurDAG->isBaseWithConstantOffset(N)) { if (N.getOpcode() == ARMISD::Wrapper && !(Subtarget->useMovt() && N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { Base = N.getOperand(0); - } else + } else { Base = N; + } - Offset = CurDAG->getRegister(0, MVT::i32); OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; } - // Thumb does not have [sp, r] address mode. RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); if ((LHSR && LHSR->getReg() == ARM::SP) || (RHSR && RHSR->getReg() == ARM::SP)) { + ConstantSDNode *LHS = dyn_cast<ConstantSDNode>(N.getOperand(0)); + ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1)); + unsigned LHSC = LHS ? LHS->getZExtValue() : 0; + unsigned RHSC = RHS ? RHS->getZExtValue() : 0; + + // Thumb does not have [sp, #imm5] address mode for non-zero imm5. + if (LHSC != 0 || RHSC != 0) return false; + Base = N; - Offset = CurDAG->getRegister(0, MVT::i32); OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; } // If the RHS is + imm5 * scale, fold into addr mode. - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - if ((RHSC & (Scale-1)) == 0) { // The constant is implicitly multiplied. - RHSC /= Scale; - if (RHSC >= 0 && RHSC < 32) { - Base = N.getOperand(0); - Offset = CurDAG->getRegister(0, MVT::i32); - OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); - return true; - } - } + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) { + Base = N.getOperand(0); + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; } Base = N.getOperand(0); - Offset = N.getOperand(1); OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; } -bool ARMDAGToDAGISel::SelectThumbAddrModeS1(SDNode *Op, SDValue N, - SDValue &Base, SDValue &OffImm, - SDValue &Offset) { - return SelectThumbAddrModeRI5(Op, N, 1, Base, OffImm, Offset); +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, + SDValue &OffImm) { + return SelectThumbAddrModeImm5S(N, 4, Base, OffImm); } -bool ARMDAGToDAGISel::SelectThumbAddrModeS2(SDNode *Op, SDValue N, - SDValue &Base, SDValue &OffImm, - SDValue &Offset) { - return SelectThumbAddrModeRI5(Op, N, 2, Base, OffImm, Offset); +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base, + SDValue &OffImm) { + return SelectThumbAddrModeImm5S(N, 2, Base, OffImm); } -bool ARMDAGToDAGISel::SelectThumbAddrModeS4(SDNode *Op, SDValue N, - SDValue &Base, SDValue &OffImm, - SDValue &Offset) { - return SelectThumbAddrModeRI5(Op, N, 4, Base, OffImm, Offset); +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, + SDValue &OffImm) { + return SelectThumbAddrModeImm5S(N, 1, Base, OffImm); } -bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDNode *Op, SDValue N, - SDValue &Base, SDValue &OffImm) { +bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, + SDValue &Base, SDValue &OffImm) { if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); @@ -654,35 +1059,35 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDNode *Op, SDValue N, return true; } - if (N.getOpcode() != ISD::ADD) + if (!CurDAG->isBaseWithConstantOffset(N)) return false; RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); if (N.getOperand(0).getOpcode() == ISD::FrameIndex || (LHSR && LHSR->getReg() == ARM::SP)) { // If the RHS is + imm8 * scale, fold into addr mode. - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - if ((RHSC & 3) == 0) { // The constant is implicitly multiplied. - RHSC >>= 2; - if (RHSC >= 0 && RHSC < 256) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); - } - OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); - return true; - } + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; } } return false; } -bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDNode *Op, SDValue N, - SDValue &BaseReg, + +//===----------------------------------------------------------------------===// +// Thumb 2 Addressing Modes +//===----------------------------------------------------------------------===// + + +bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg, SDValue &Opc) { if (DisableShifterOp) return false; @@ -704,19 +1109,22 @@ bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDNode *Op, SDValue N, return false; } -bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDNode *Op, SDValue N, +bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R + imm12 operands. // Base only. - if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) { + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + !CurDAG->isBaseWithConstantOffset(N)) { if (N.getOpcode() == ISD::FrameIndex) { - // Match frame index... + // Match frame index. int FI = cast<FrameIndexSDNode>(N)->getIndex(); Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; - } else if (N.getOpcode() == ARMISD::Wrapper && + } + + if (N.getOpcode() == ARMISD::Wrapper && !(Subtarget->useMovt() && N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { Base = N.getOperand(0); @@ -729,7 +1137,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDNode *Op, SDValue N, } if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - if (SelectT2AddrModeImm8(Op, N, Base, OffImm)) + if (SelectT2AddrModeImm8(N, Base, OffImm)) // Let t2LDRi8 handle (R - imm8). return false; @@ -754,24 +1162,26 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDNode *Op, SDValue N, return true; } -bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDNode *Op, SDValue N, +bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R - imm8 operands. - if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::SUB) { - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getSExtValue(); - if (N.getOpcode() == ISD::SUB) - RHSC = -RHSC; - - if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative) - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); - } - OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); - return true; + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + !CurDAG->isBaseWithConstantOffset(N)) + return false; + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getSExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative) + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; } } @@ -784,52 +1194,22 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) ? cast<LoadSDNode>(Op)->getAddressingMode() : cast<StoreSDNode>(Op)->getAddressingMode(); - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N)) { - int RHSC = (int)RHS->getZExtValue(); - if (RHSC >= 0 && RHSC < 0x100) { // 8 bits. - OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) - ? CurDAG->getTargetConstant(RHSC, MVT::i32) - : CurDAG->getTargetConstant(-RHSC, MVT::i32); - return true; - } - } - - return false; -} - -bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, - SDValue &Base, SDValue &OffImm) { - if (N.getOpcode() == ISD::ADD) { - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - // 8 bits. - if (((RHSC & 0x3) == 0) && - ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { - Base = N.getOperand(0); - OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); - return true; - } - } - } else if (N.getOpcode() == ISD::SUB) { - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - // 8 bits. - if (((RHSC & 0x3) == 0) && (RHSC >= 0 && RHSC < 0x400)) { - Base = N.getOperand(0); - OffImm = CurDAG->getTargetConstant(-RHSC, MVT::i32); - return true; - } - } + int RHSC; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits. + OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) + ? CurDAG->getTargetConstant(RHSC, MVT::i32) + : CurDAG->getTargetConstant(-RHSC, MVT::i32); + return true; } return false; } -bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDNode *Op, SDValue N, +bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm) { // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12. - if (N.getOpcode() != ISD::ADD) + if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) return false; // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8. @@ -841,6 +1221,12 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDNode *Op, SDValue N, return false; } + if (Subtarget->isCortexA9() && !N.hasOneUse()) { + // Compute R + (R << [1,2,3]) and reuse it. + Base = N; + return false; + } + // Look for (R + R) or (R + (R << [1,2,3])). unsigned ShAmt = 0; Base = N.getOperand(0); @@ -859,11 +1245,12 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDNode *Op, SDValue N, // it. if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(OffReg.getOperand(1))) { ShAmt = Sh->getZExtValue(); - if (ShAmt >= 4) { + if (ShAmt < 4 && isShifterOpProfitable(OffReg, ShOpcVal, ShAmt)) + OffReg = OffReg.getOperand(0); + else { ShAmt = 0; ShOpcVal = ARM_AM::no_shift; - } else - OffReg = OffReg.getOperand(0); + } } else { ShOpcVal = ARM_AM::no_shift; } @@ -1045,52 +1432,43 @@ SDNode *ARMDAGToDAGISel::QuadQRegs(EVT VT, SDValue V0, SDValue V1, return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8); } -/// OctoDRegs - Form 8 consecutive D registers. -/// -SDNode *ARMDAGToDAGISel::OctoDRegs(EVT VT, SDValue V0, SDValue V1, - SDValue V2, SDValue V3, - SDValue V4, SDValue V5, - SDValue V6, SDValue V7) { - DebugLoc dl = V0.getNode()->getDebugLoc(); - SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); - SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); - SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, MVT::i32); - SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32); - SDValue SubReg4 = CurDAG->getTargetConstant(ARM::dsub_4, MVT::i32); - SDValue SubReg5 = CurDAG->getTargetConstant(ARM::dsub_5, MVT::i32); - SDValue SubReg6 = CurDAG->getTargetConstant(ARM::dsub_6, MVT::i32); - SDValue SubReg7 = CurDAG->getTargetConstant(ARM::dsub_7, MVT::i32); - const SDValue Ops[] ={ V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3, - V4, SubReg4, V5, SubReg5, V6, SubReg6, V7, SubReg7 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 16); +/// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand +/// of a NEON VLD or VST instruction. The supported values depend on the +/// number of registers being loaded. +SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, unsigned NumVecs, + bool is64BitVector) { + unsigned NumRegs = NumVecs; + if (!is64BitVector && NumVecs < 3) + NumRegs *= 2; + + unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + if (Alignment >= 32 && NumRegs == 4) + Alignment = 32; + else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4)) + Alignment = 16; + else if (Alignment >= 8) + Alignment = 8; + else + Alignment = 0; + + return CurDAG->getTargetConstant(Alignment, MVT::i32); } -/// GetNEONSubregVT - Given a type for a 128-bit NEON vector, return the type -/// for a 64-bit subregister of the vector. -static EVT GetNEONSubregVT(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("unhandled NEON type"); - case MVT::v16i8: return MVT::v8i8; - case MVT::v8i16: return MVT::v4i16; - case MVT::v4f32: return MVT::v2f32; - case MVT::v4i32: return MVT::v2i32; - case MVT::v2i64: return MVT::v1i64; - } -} - -SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, +SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; - if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align)) + unsigned AddrOpIdx = isUpdating ? 1 : 2; + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; SDValue Chain = N->getOperand(0); EVT VT = N->getValueType(0); bool is64BitVector = VT.is64BitVector(); + Align = GetVLDSTAlign(Align, NumVecs, is64BitVector); unsigned OpcodeIndex; switch (VT.getSimpleVT().SimpleTy) { @@ -1120,88 +1498,97 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, ResTyElts *= 2; ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts); } + std::vector<EVT> ResTys; + ResTys.push_back(ResTy); + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); SDValue Pred = getAL(CurDAG); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); - SDValue SuperReg; - if (is64BitVector) { - unsigned Opc = DOpcodes[OpcodeIndex]; - const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5); - if (NumVecs == 1) - return VLd; - - SuperReg = SDValue(VLd, 0); - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue D = CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec, - dl, VT, SuperReg); - ReplaceUses(SDValue(N, Vec), D); - } - ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); - return NULL; - } - - if (NumVecs <= 2) { - // Quad registers are directly supported for VLD1 and VLD2, - // loading pairs of D regs. - unsigned Opc = QOpcodes0[OpcodeIndex]; - const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5); - if (NumVecs == 1) - return VLd; + SDNode *VLd; + SmallVector<SDValue, 7> Ops; - SuperReg = SDValue(VLd, 0); - Chain = SDValue(VLd, 1); + // Double registers and VLD1/VLD2 quad registers are directly supported. + if (is64BitVector || NumVecs <= 2) { + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes0[OpcodeIndex]); + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); } else { // Otherwise, quad registers are loaded with two separate instructions, // where one loads the even registers and the other loads the odd registers. EVT AddrTy = MemAddr.getValueType(); - // Load the even subregs. - unsigned Opc = QOpcodes0[OpcodeIndex]; + // Load the even subregs. This is always an updating load, so that it + // provides the address to the second load for the odd subregs. SDValue ImplDef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0); const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain }; - SDNode *VLdA = - CurDAG->getMachineNode(Opc, dl, ResTy, AddrTy, MVT::Other, OpsA, 7); + SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, + ResTy, AddrTy, MVT::Other, OpsA, 7); Chain = SDValue(VLdA, 2); // Load the odd subregs. - Opc = QOpcodes1[OpcodeIndex]; - const SDValue OpsB[] = { SDValue(VLdA, 1), Align, Reg0, SDValue(VLdA, 0), - Pred, Reg0, Chain }; - SDNode *VLdB = - CurDAG->getMachineNode(Opc, dl, ResTy, AddrTy, MVT::Other, OpsB, 7); - SuperReg = SDValue(VLdB, 0); - Chain = SDValue(VLdB, 2); - } - - // Extract out the Q registers. - assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, - dl, VT, SuperReg); - ReplaceUses(SDValue(N, Vec), Q); - } - ReplaceUses(SDValue(N, NumVecs), Chain); + Ops.push_back(SDValue(VLdA, 1)); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + assert(isa<ConstantSDNode>(Inc.getNode()) && + "only constant post-increment update allowed for VLD3/4"); + (void)Inc; + Ops.push_back(Reg0); + } + Ops.push_back(SDValue(VLdA, 0)); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, + Ops.data(), Ops.size()); + } + + if (NumVecs == 1) + return VLd; + + // Extract out the subregisters. + SDValue SuperReg = SDValue(VLd, 0); + assert(ARM::dsub_7 == ARM::dsub_0+7 && + ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0); + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); return NULL; } -SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, +SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; - if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align)) + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; SDValue Chain = N->getOperand(0); - EVT VT = N->getOperand(3).getValueType(); + EVT VT = N->getOperand(Vec0Idx).getValueType(); bool is64BitVector = VT.is64BitVector(); + Align = GetVLDSTAlign(Align, NumVecs, is64BitVector); unsigned OpcodeIndex; switch (VT.getSimpleVT().SimpleTy) { @@ -1222,119 +1609,128 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, break; } + std::vector<EVT> ResTys; + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + SDValue Pred = getAL(CurDAG); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); - SmallVector<SDValue, 7> Ops; - Ops.push_back(MemAddr); - Ops.push_back(Align); - if (is64BitVector) { + // Double registers and VST1/VST2 quad registers are directly supported. + if (is64BitVector || NumVecs <= 2) { + SDValue SrcReg; if (NumVecs == 1) { - Ops.push_back(N->getOperand(3)); - } else { - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - + SrcReg = N->getOperand(Vec0Idx); + } else if (is64BitVector) { // Form a REG_SEQUENCE to force register allocation. + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); if (NumVecs == 2) - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); + SrcReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); else { - SDValue V2 = N->getOperand(2+3); - // If it's a vld3, form a quad D-register and leave the last part as + SDValue V2 = N->getOperand(Vec0Idx + 2); + // If it's a vst3, form a quad D-register and leave the last part as // an undef. SDValue V3 = (NumVecs == 3) ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + : N->getOperand(Vec0Idx + 3); + SrcReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); } - Ops.push_back(RegSeq); - } - Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register - Ops.push_back(Chain); - unsigned Opc = DOpcodes[OpcodeIndex]; - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 6); - } - - if (NumVecs <= 2) { - // Quad registers are directly supported for VST1 and VST2. - unsigned Opc = QOpcodes0[OpcodeIndex]; - if (NumVecs == 1) { - Ops.push_back(N->getOperand(3)); } else { // Form a QQ register. - SDValue Q0 = N->getOperand(3); - SDValue Q1 = N->getOperand(4); - Ops.push_back(SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0)); + SDValue Q0 = N->getOperand(Vec0Idx); + SDValue Q1 = N->getOperand(Vec0Idx + 1); + SrcReg = SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0); + } + + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes0[OpcodeIndex]); + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); } + Ops.push_back(SrcReg); Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register + Ops.push_back(Reg0); Ops.push_back(Chain); - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 6); + return CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); } // Otherwise, quad registers are stored with two separate instructions, // where one stores the even registers and the other stores the odd registers. // Form the QQQQ REG_SEQUENCE. - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - SDValue V2 = N->getOperand(2+3); + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); + SDValue V2 = N->getOperand(Vec0Idx + 2); SDValue V3 = (NumVecs == 3) ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) - : N->getOperand(3+3); + : N->getOperand(Vec0Idx + 3); SDValue RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); - // Store the even D registers. - Ops.push_back(Reg0); // post-access address offset - Ops.push_back(RegSeq); - Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register - Ops.push_back(Chain); - unsigned Opc = QOpcodes0[OpcodeIndex]; - SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), 7); + // Store the even D registers. This is always an updating store, so that it + // provides the address to the second store for the odd subregs. + const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain }; + SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, + MemAddr.getValueType(), + MVT::Other, OpsA, 7); Chain = SDValue(VStA, 1); // Store the odd D registers. - Ops[0] = SDValue(VStA, 0); // MemAddr - Ops[6] = Chain; - Opc = QOpcodes1[OpcodeIndex]; - SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), 7); - Chain = SDValue(VStB, 1); - ReplaceUses(SDValue(N, 0), Chain); - return NULL; + Ops.push_back(SDValue(VStA, 0)); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + assert(isa<ConstantSDNode>(Inc.getNode()) && + "only constant post-increment update allowed for VST3/4"); + (void)Inc; + Ops.push_back(Reg0); + } + Ops.push_back(RegSeq); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + return CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, + Ops.data(), Ops.size()); } SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, - unsigned NumVecs, unsigned *DOpcodes, - unsigned *QOpcodes0, - unsigned *QOpcodes1) { + bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, + unsigned *QOpcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; - if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align)) + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; SDValue Chain = N->getOperand(0); unsigned Lane = - cast<ConstantSDNode>(N->getOperand(NumVecs+3))->getZExtValue(); - EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType(); + cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); + EVT VT = N->getOperand(Vec0Idx).getValueType(); bool is64BitVector = VT.is64BitVector(); - // Quad registers are handled by load/store of subregs. Find the subreg info. - unsigned NumElts = 0; - bool Even = false; - EVT RegVT = VT; - if (!is64BitVector) { - RegVT = GetNEONSubregVT(VT); - NumElts = RegVT.getVectorNumElements(); - Even = Lane < NumElts; - } + unsigned Alignment = 0; + if (NumVecs != 3) { + Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8; + if (Alignment > NumBytes) + Alignment = NumBytes; + if (Alignment < 8 && Alignment < NumBytes) + Alignment = 0; + // Alignment must be a power of two; make sure of that. + Alignment = (Alignment & -Alignment); + if (Alignment == 1) + Alignment = 0; + } + Align = CurDAG->getTargetConstant(Alignment, MVT::i32); unsigned OpcodeIndex; switch (VT.getSimpleVT().SimpleTy) { @@ -1350,124 +1746,144 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, case MVT::v4i32: OpcodeIndex = 1; break; } + std::vector<EVT> ResTys; + if (IsLoad) { + unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; + if (!is64BitVector) + ResTyElts *= 2; + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), + MVT::i64, ResTyElts)); + } + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + SDValue Pred = getAL(CurDAG); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); - SmallVector<SDValue, 10> Ops; + SmallVector<SDValue, 8> Ops; Ops.push_back(MemAddr); Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } - unsigned Opc = 0; - if (is64BitVector) { - Opc = DOpcodes[OpcodeIndex]; - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - if (NumVecs == 2) { - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - } else { - SDValue V2 = N->getOperand(2+3); - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } - - // Now extract the D registers back out. - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq)); - if (NumVecs > 2) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT,RegSeq)); - if (NumVecs > 3) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT,RegSeq)); + SDValue SuperReg; + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); + if (NumVecs == 2) { + if (is64BitVector) + SuperReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); + else + SuperReg = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); } else { - // Check if this is loading the even or odd subreg of a Q register. - if (Lane < NumElts) { - Opc = QOpcodes0[OpcodeIndex]; - } else { - Lane -= NumElts; - Opc = QOpcodes1[OpcodeIndex]; - } - - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - if (NumVecs == 2) { - RegSeq = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); - } else { - SDValue V2 = N->getOperand(2+3); - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); - } - - // Extract the subregs of the input vector. - unsigned SubIdx = Even ? ARM::dsub_0 : ARM::dsub_1; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(SubIdx+Vec*2, dl, RegVT, - RegSeq)); + SDValue V2 = N->getOperand(Vec0Idx + 2); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(Vec0Idx + 3); + if (is64BitVector) + SuperReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + else + SuperReg = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); } + Ops.push_back(SuperReg); Ops.push_back(getI32Imm(Lane)); Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes[OpcodeIndex]); + SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, + Ops.data(), Ops.size()); if (!IsLoad) - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+6); + return VLdLn; - std::vector<EVT> ResTys(NumVecs, RegVT); - ResTys.push_back(MVT::Other); - SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(),NumVecs+6); + // Extract the subregisters. + SuperReg = SDValue(VLdLn, 0); + assert(ARM::dsub_7 == ARM::dsub_0+7 && + ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); + return NULL; +} - // Form a REG_SEQUENCE to force register allocation. - SDValue RegSeq; - if (is64BitVector) { - SDValue V0 = SDValue(VLdLn, 0); - SDValue V1 = SDValue(VLdLn, 1); - if (NumVecs == 2) { - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - } else { - SDValue V2 = SDValue(VLdLn, 2); - // If it's a vld3, form a quad D-register but discard the last part. - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : SDValue(VLdLn, 3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } - } else { - // For 128-bit vectors, take the 64-bit results of the load and insert - // them as subregs into the result. - SDValue V[8]; - for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { - if (Even) { - V[i] = SDValue(VLdLn, Vec); - V[i+1] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - } else { - V[i] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - V[i+1] = SDValue(VLdLn, Vec); - } - } - if (NumVecs == 3) - V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); +SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, + unsigned NumVecs, unsigned *Opcodes) { + assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); - if (NumVecs == 2) - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V[0], V[1], V[2], V[3]), 0); - else - RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], - V[4], V[5], V[6], V[7]), 0); + SDValue MemAddr, Align; + if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align)) + return NULL; + + SDValue Chain = N->getOperand(0); + EVT VT = N->getValueType(0); + + unsigned Alignment = 0; + if (NumVecs != 3) { + Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8; + if (Alignment > NumBytes) + Alignment = NumBytes; + if (Alignment < 8 && Alignment < NumBytes) + Alignment = 0; + // Alignment must be a power of two; make sure of that. + Alignment = (Alignment & -Alignment); + if (Alignment == 1) + Alignment = 0; + } + Align = CurDAG->getTargetConstant(Alignment, MVT::i32); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld-dup type"); + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + } + + SDValue Pred = getAL(CurDAG); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue SuperReg; + unsigned Opc = Opcodes[OpcodeIndex]; + SmallVector<SDValue, 6> Ops; + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(2); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); } + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; + std::vector<EVT> ResTys; + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts)); + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + SDNode *VLdDup = + CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + SuperReg = SDValue(VLdDup, 0); + + // Extract the subregisters. assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; + unsigned SubIdx = ARM::dsub_0; for (unsigned Vec = 0; Vec < NumVecs; ++Vec) ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, RegSeq)); - ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, NumVecs)); + CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); return NULL; } @@ -1486,7 +1902,7 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0); else { SDValue V2 = N->getOperand(FirstTblReg + 2); - // If it's a vtbl3, form a quad D-register and leave the last part as + // If it's a vtbl3, form a quad D-register and leave the last part as // an undef. SDValue V3 = (NumVecs == 3) ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) @@ -1494,17 +1910,10 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); } - // Now extract the D registers back out. SmallVector<SDValue, 6> Ops; if (IsExt) Ops.push_back(N->getOperand(1)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq)); - if (NumVecs > 2) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, RegSeq)); - if (NumVecs > 3) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, RegSeq)); - + Ops.push_back(RegSeq); Ops.push_back(N->getOperand(FirstTblReg + NumVecs)); Ops.push_back(getAL(CurDAG)); // predicate Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register @@ -1574,7 +1983,7 @@ SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { SDValue CPTmp0; SDValue CPTmp1; - if (SelectT2ShifterOperandReg(N, TrueVal, CPTmp0, CPTmp1)) { + if (SelectT2ShifterOperandReg(TrueVal, CPTmp0, CPTmp1)) { unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue(); unsigned SOShOp = ARM_AM::getSORegShOp(SOVal); unsigned Opc = 0; @@ -1602,7 +2011,7 @@ SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, SDValue CPTmp0; SDValue CPTmp1; SDValue CPTmp2; - if (SelectShifterOperandReg(N, TrueVal, CPTmp0, CPTmp1, CPTmp2)) { + if (SelectShifterOperandReg(TrueVal, CPTmp0, CPTmp1, CPTmp2)) { SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag }; return CurDAG->SelectNodeTo(N, ARM::MOVCCs, MVT::i32, Ops, 7); @@ -1611,36 +2020,66 @@ SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, } SDNode *ARMDAGToDAGISel:: -SelectT2CMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, - ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { +SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal); if (!T) return 0; - if (Pred_t2_so_imm(TrueVal.getNode())) { - SDValue True = CurDAG->getTargetConstant(T->getZExtValue(), MVT::i32); + unsigned Opc = 0; + unsigned TrueImm = T->getZExtValue(); + if (is_t2_so_imm(TrueImm)) { + Opc = ARM::t2MOVCCi; + } else if (TrueImm <= 0xffff) { + Opc = ARM::t2MOVCCi16; + } else if (is_t2_so_imm_not(TrueImm)) { + TrueImm = ~TrueImm; + Opc = ARM::t2MVNCCi; + } else if (TrueVal.getNode()->hasOneUse() && Subtarget->hasV6T2Ops()) { + // Large immediate. + Opc = ARM::t2MOVCCi32imm; + } + + if (Opc) { + SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32); SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag }; - return CurDAG->SelectNodeTo(N, - ARM::t2MOVCCi, MVT::i32, Ops, 5); + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); } + return 0; } SDNode *ARMDAGToDAGISel:: -SelectARMCMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, - ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { +SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal); if (!T) return 0; - if (Pred_so_imm(TrueVal.getNode())) { - SDValue True = CurDAG->getTargetConstant(T->getZExtValue(), MVT::i32); + unsigned Opc = 0; + unsigned TrueImm = T->getZExtValue(); + bool isSoImm = is_so_imm(TrueImm); + if (isSoImm) { + Opc = ARM::MOVCCi; + } else if (Subtarget->hasV6T2Ops() && TrueImm <= 0xffff) { + Opc = ARM::MOVCCi16; + } else if (is_so_imm_not(TrueImm)) { + TrueImm = ~TrueImm; + Opc = ARM::MVNCCi; + } else if (TrueVal.getNode()->hasOneUse() && + (Subtarget->hasV6T2Ops() || ARM_AM::isSOImmTwoPartVal(TrueImm))) { + // Large immediate. + Opc = ARM::MOVCCi32imm; + } + + if (Opc) { + SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32); SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag }; - return CurDAG->SelectNodeTo(N, - ARM::MOVCCi, MVT::i32, Ops, 5); + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); } + return 0; } @@ -1688,18 +2127,18 @@ SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) { // (so_imm:i32 (imm:i32):$true), (imm:i32):$cc) // Pattern complexity = 10 cost = 1 size = 0 if (Subtarget->isThumb()) { - SDNode *Res = SelectT2CMOVSoImmOp(N, FalseVal, TrueVal, + SDNode *Res = SelectT2CMOVImmOp(N, FalseVal, TrueVal, CCVal, CCR, InFlag); if (!Res) - Res = SelectT2CMOVSoImmOp(N, TrueVal, FalseVal, + Res = SelectT2CMOVImmOp(N, TrueVal, FalseVal, ARMCC::getOppositeCondition(CCVal), CCR, InFlag); if (Res) return Res; } else { - SDNode *Res = SelectARMCMOVSoImmOp(N, FalseVal, TrueVal, + SDNode *Res = SelectARMCMOVImmOp(N, FalseVal, TrueVal, CCVal, CCR, InFlag); if (!Res) - Res = SelectARMCMOVSoImmOp(N, TrueVal, FalseVal, + Res = SelectARMCMOVImmOp(N, TrueVal, FalseVal, ARMCC::getOppositeCondition(CCVal), CCR, InFlag); if (Res) return Res; @@ -1742,13 +2181,7 @@ SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) { EVT VT = N->getValueType(0); if (!VT.is128BitVector() || N->getNumOperands() != 2) llvm_unreachable("unexpected CONCAT_VECTORS"); - DebugLoc dl = N->getDebugLoc(); - SDValue V0 = N->getOperand(0); - SDValue V1 = N->getOperand(1); - SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); - SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); - const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); + return PairDRegs(VT, N->getOperand(0), N->getOperand(1)); } SDNode *ARMDAGToDAGISel::Select(SDNode *N) { @@ -1788,19 +2221,18 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Pred = getAL(CurDAG); SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() }; - ResNode = CurDAG->getMachineNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other, + ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other, Ops, 4); } else { SDValue Ops[] = { CPIdx, - CurDAG->getRegister(0, MVT::i32), CurDAG->getTargetConstant(0, MVT::i32), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getEntryNode() }; ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, - Ops, 6); + Ops, 5); } ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0)); return NULL; @@ -1930,7 +2362,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5); + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::UMULL : ARM::UMULLv5, + dl, MVT::i32, MVT::i32, Ops, 5); } } case ISD::SMUL_LOHI: { @@ -1944,7 +2378,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5); + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::SMULL : ARM::SMULLv5, + dl, MVT::i32, MVT::i32, Ops, 5); } } case ISD::LOAD: { @@ -1987,7 +2423,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { MVT::i32); SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag }; SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, - MVT::Flag, Ops, 5); + MVT::Glue, Ops, 5); Chain = SDValue(ResNode, 0); if (N->getNumValues() == 2) { InFlag = SDValue(ResNode, 1); @@ -2088,12 +2524,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { EVT VecVT = N->getValueType(0); EVT EltVT = VecVT.getVectorElementType(); unsigned NumElts = VecVT.getVectorNumElements(); - if (EltVT.getSimpleVT() == MVT::f64) { + if (EltVT == MVT::f64) { assert(NumElts == 2 && "unexpected type for BUILD_VECTOR"); return PairDRegs(VecVT, N->getOperand(0), N->getOperand(1)); } - assert(EltVT.getSimpleVT() == MVT::f32 && - "unexpected type for BUILD_VECTOR"); + assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR"); if (NumElts == 2) return PairSRegs(VecVT, N->getOperand(0), N->getOperand(1)); assert(NumElts == 4 && "unexpected type for BUILD_VECTOR"); @@ -2101,6 +2536,170 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { N->getOperand(2), N->getOperand(3)); } + case ARMISD::VLD2DUP: { + unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd16Pseudo, + ARM::VLD2DUPd32Pseudo }; + return SelectVLDDup(N, false, 2, Opcodes); + } + + case ARMISD::VLD3DUP: { + unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo, + ARM::VLD3DUPd32Pseudo }; + return SelectVLDDup(N, false, 3, Opcodes); + } + + case ARMISD::VLD4DUP: { + unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo, + ARM::VLD4DUPd32Pseudo }; + return SelectVLDDup(N, false, 4, Opcodes); + } + + case ARMISD::VLD2DUP_UPD: { + unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo_UPD, ARM::VLD2DUPd16Pseudo_UPD, + ARM::VLD2DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 2, Opcodes); + } + + case ARMISD::VLD3DUP_UPD: { + unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD, + ARM::VLD3DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 3, Opcodes); + } + + case ARMISD::VLD4DUP_UPD: { + unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd16Pseudo_UPD, + ARM::VLD4DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 4, Opcodes); + } + + case ARMISD::VLD1_UPD: { + unsigned DOpcodes[] = { ARM::VLD1d8_UPD, ARM::VLD1d16_UPD, + ARM::VLD1d32_UPD, ARM::VLD1d64_UPD }; + unsigned QOpcodes[] = { ARM::VLD1q8Pseudo_UPD, ARM::VLD1q16Pseudo_UPD, + ARM::VLD1q32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD }; + return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VLD2_UPD: { + unsigned DOpcodes[] = { ARM::VLD2d8Pseudo_UPD, ARM::VLD2d16Pseudo_UPD, + ARM::VLD2d32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD2q8Pseudo_UPD, ARM::VLD2q16Pseudo_UPD, + ARM::VLD2q32Pseudo_UPD }; + return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VLD3_UPD: { + unsigned DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD, + ARM::VLD3d32Pseudo_UPD, ARM::VLD1d64TPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, + ARM::VLD3q16oddPseudo_UPD, + ARM::VLD3q32oddPseudo_UPD }; + return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VLD4_UPD: { + unsigned DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, + ARM::VLD4d32Pseudo_UPD, ARM::VLD1d64QPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, + ARM::VLD4q16oddPseudo_UPD, + ARM::VLD4q32oddPseudo_UPD }; + return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VLD2LN_UPD: { + unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd16Pseudo_UPD, + ARM::VLD2LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD, + ARM::VLD2LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes); + } + + case ARMISD::VLD3LN_UPD: { + unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd16Pseudo_UPD, + ARM::VLD3LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD, + ARM::VLD3LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes); + } + + case ARMISD::VLD4LN_UPD: { + unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd16Pseudo_UPD, + ARM::VLD4LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD, + ARM::VLD4LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes); + } + + case ARMISD::VST1_UPD: { + unsigned DOpcodes[] = { ARM::VST1d8_UPD, ARM::VST1d16_UPD, + ARM::VST1d32_UPD, ARM::VST1d64_UPD }; + unsigned QOpcodes[] = { ARM::VST1q8Pseudo_UPD, ARM::VST1q16Pseudo_UPD, + ARM::VST1q32Pseudo_UPD, ARM::VST1q64Pseudo_UPD }; + return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VST2_UPD: { + unsigned DOpcodes[] = { ARM::VST2d8Pseudo_UPD, ARM::VST2d16Pseudo_UPD, + ARM::VST2d32Pseudo_UPD, ARM::VST1q64Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST2q8Pseudo_UPD, ARM::VST2q16Pseudo_UPD, + ARM::VST2q32Pseudo_UPD }; + return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VST3_UPD: { + unsigned DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD, + ARM::VST3d32Pseudo_UPD, ARM::VST1d64TPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, + ARM::VST3q16Pseudo_UPD, + ARM::VST3q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, + ARM::VST3q16oddPseudo_UPD, + ARM::VST3q32oddPseudo_UPD }; + return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VST4_UPD: { + unsigned DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, + ARM::VST4d32Pseudo_UPD, ARM::VST1d64QPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, + ARM::VST4q16oddPseudo_UPD, + ARM::VST4q32oddPseudo_UPD }; + return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VST2LN_UPD: { + unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd16Pseudo_UPD, + ARM::VST2LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD, + ARM::VST2LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes); + } + + case ARMISD::VST3LN_UPD: { + unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd16Pseudo_UPD, + ARM::VST3LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD, + ARM::VST3LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes); + } + + case ARMISD::VST4LN_UPD: { + unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd16Pseudo_UPD, + ARM::VST4LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD, + ARM::VST4LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes); + } + case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); @@ -2113,7 +2712,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD1d32, ARM::VLD1d64 }; unsigned QOpcodes[] = { ARM::VLD1q8Pseudo, ARM::VLD1q16Pseudo, ARM::VLD1q32Pseudo, ARM::VLD1q64Pseudo }; - return SelectVLD(N, 1, DOpcodes, QOpcodes, 0); + return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld2: { @@ -2121,7 +2720,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD2d32Pseudo, ARM::VLD1q64Pseudo }; unsigned QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, ARM::VLD2q32Pseudo }; - return SelectVLD(N, 2, DOpcodes, QOpcodes, 0); + return SelectVLD(N, false, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld3: { @@ -2130,10 +2729,10 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, ARM::VLD3q16Pseudo_UPD, ARM::VLD3q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, - ARM::VLD3q16oddPseudo_UPD, - ARM::VLD3q32oddPseudo_UPD }; - return SelectVLD(N, 3, DOpcodes, QOpcodes0, QOpcodes1); + unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo, + ARM::VLD3q16oddPseudo, + ARM::VLD3q32oddPseudo }; + return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vld4: { @@ -2142,31 +2741,31 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, ARM::VLD4q16Pseudo_UPD, ARM::VLD4q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, - ARM::VLD4q16oddPseudo_UPD, - ARM::VLD4q32oddPseudo_UPD }; - return SelectVLD(N, 4, DOpcodes, QOpcodes0, QOpcodes1); + unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo, + ARM::VLD4q16oddPseudo, + ARM::VLD4q32oddPseudo }; + return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vld2lane: { - unsigned DOpcodes[] = { ARM::VLD2LNd8, ARM::VLD2LNd16, ARM::VLD2LNd32 }; - unsigned QOpcodes0[] = { ARM::VLD2LNq16, ARM::VLD2LNq32 }; - unsigned QOpcodes1[] = { ARM::VLD2LNq16odd, ARM::VLD2LNq32odd }; - return SelectVLDSTLane(N, true, 2, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo, + ARM::VLD2LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo }; + return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vld3lane: { - unsigned DOpcodes[] = { ARM::VLD3LNd8, ARM::VLD3LNd16, ARM::VLD3LNd32 }; - unsigned QOpcodes0[] = { ARM::VLD3LNq16, ARM::VLD3LNq32 }; - unsigned QOpcodes1[] = { ARM::VLD3LNq16odd, ARM::VLD3LNq32odd }; - return SelectVLDSTLane(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd16Pseudo, + ARM::VLD3LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo }; + return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vld4lane: { - unsigned DOpcodes[] = { ARM::VLD4LNd8, ARM::VLD4LNd16, ARM::VLD4LNd32 }; - unsigned QOpcodes0[] = { ARM::VLD4LNq16, ARM::VLD4LNq32 }; - unsigned QOpcodes1[] = { ARM::VLD4LNq16odd, ARM::VLD4LNq32odd }; - return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo, + ARM::VLD4LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo }; + return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst1: { @@ -2174,7 +2773,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST1d32, ARM::VST1d64 }; unsigned QOpcodes[] = { ARM::VST1q8Pseudo, ARM::VST1q16Pseudo, ARM::VST1q32Pseudo, ARM::VST1q64Pseudo }; - return SelectVST(N, 1, DOpcodes, QOpcodes, 0); + return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst2: { @@ -2182,7 +2781,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST2d32Pseudo, ARM::VST1q64Pseudo }; unsigned QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo, ARM::VST2q32Pseudo }; - return SelectVST(N, 2, DOpcodes, QOpcodes, 0); + return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst3: { @@ -2191,10 +2790,10 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, ARM::VST3q16Pseudo_UPD, ARM::VST3q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, - ARM::VST3q16oddPseudo_UPD, - ARM::VST3q32oddPseudo_UPD }; - return SelectVST(N, 3, DOpcodes, QOpcodes0, QOpcodes1); + unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo, + ARM::VST3q16oddPseudo, + ARM::VST3q32oddPseudo }; + return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vst4: { @@ -2203,31 +2802,31 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, ARM::VST4q16Pseudo_UPD, ARM::VST4q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, - ARM::VST4q16oddPseudo_UPD, - ARM::VST4q32oddPseudo_UPD }; - return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1); + unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo, + ARM::VST4q16oddPseudo, + ARM::VST4q32oddPseudo }; + return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vst2lane: { - unsigned DOpcodes[] = { ARM::VST2LNd8, ARM::VST2LNd16, ARM::VST2LNd32 }; - unsigned QOpcodes0[] = { ARM::VST2LNq16, ARM::VST2LNq32 }; - unsigned QOpcodes1[] = { ARM::VST2LNq16odd, ARM::VST2LNq32odd }; - return SelectVLDSTLane(N, false, 2, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo, + ARM::VST2LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo }; + return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst3lane: { - unsigned DOpcodes[] = { ARM::VST3LNd8, ARM::VST3LNd16, ARM::VST3LNd32 }; - unsigned QOpcodes0[] = { ARM::VST3LNq16, ARM::VST3LNq32 }; - unsigned QOpcodes1[] = { ARM::VST3LNq16odd, ARM::VST3LNq32odd }; - return SelectVLDSTLane(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo, + ARM::VST3LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo }; + return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst4lane: { - unsigned DOpcodes[] = { ARM::VST4LNd8, ARM::VST4LNd16, ARM::VST4LNd32 }; - unsigned QOpcodes0[] = { ARM::VST4LNq16, ARM::VST4LNq32 }; - unsigned QOpcodes1[] = { ARM::VST4LNq16odd, ARM::VST4LNq32odd }; - return SelectVLDSTLane(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo, + ARM::VST4LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo }; + return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes); } } break; @@ -2240,18 +2839,18 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { break; case Intrinsic::arm_neon_vtbl2: - return SelectVTBL(N, false, 2, ARM::VTBL2); + return SelectVTBL(N, false, 2, ARM::VTBL2Pseudo); case Intrinsic::arm_neon_vtbl3: - return SelectVTBL(N, false, 3, ARM::VTBL3); + return SelectVTBL(N, false, 3, ARM::VTBL3Pseudo); case Intrinsic::arm_neon_vtbl4: - return SelectVTBL(N, false, 4, ARM::VTBL4); + return SelectVTBL(N, false, 4, ARM::VTBL4Pseudo); case Intrinsic::arm_neon_vtbx2: - return SelectVTBL(N, true, 2, ARM::VTBX2); + return SelectVTBL(N, true, 2, ARM::VTBX2Pseudo); case Intrinsic::arm_neon_vtbx3: - return SelectVTBL(N, true, 3, ARM::VTBX3); + return SelectVTBL(N, true, 3, ARM::VTBX3Pseudo); case Intrinsic::arm_neon_vtbx4: - return SelectVTBL(N, true, 4, ARM::VTBX4); + return SelectVTBL(N, true, 4, ARM::VTBX4Pseudo); } break; } diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index ce4a2c9..1835ec0 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -15,6 +15,7 @@ #define DEBUG_TYPE "arm-isel" #include "ARM.h" #include "ARMAddressingModes.h" +#include "ARMCallingConv.h" #include "ARMConstantPoolValue.h" #include "ARMISelLowering.h" #include "ARMMachineFunctionInfo.h" @@ -28,9 +29,11 @@ #include "llvm/Function.h" #include "llvm/GlobalValue.h" #include "llvm/Instruction.h" +#include "llvm/Instructions.h" #include "llvm/Intrinsics.h" #include "llvm/Type.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -41,6 +44,7 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/VectorExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -50,6 +54,7 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); +STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); // This option should go away when tail calls fully work. static cl::opt<bool> @@ -57,14 +62,7 @@ EnableARMTailCalls("arm-tail-calls", cl::Hidden, cl::desc("Generate tail calls (TEMPORARY OPTION)."), cl::init(false)); -// This option should go away when Machine LICM is smart enough to hoist a -// reg-to-reg VDUP. -static cl::opt<bool> -EnableARMVDUPsplat("arm-vdup-splat", cl::Hidden, - cl::desc("Generate VDUP for integer constant splats (TEMPORARY OPTION)."), - cl::init(false)); - -static cl::opt<bool> +cl::opt<bool> EnableARMLongCalls("arm-long-calls", cl::Hidden, cl::desc("Generate calls via indirect call instructions"), cl::init(false)); @@ -74,28 +72,6 @@ ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true)); -static cl::opt<bool> -EnableARMCodePlacement("arm-code-placement", cl::Hidden, - cl::desc("Enable code placement pass for ARM"), - cl::init(false)); - -static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); -static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); -static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); -static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); - void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT) { if (VT != PromotedLdStVT) { @@ -111,8 +87,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT ElemTy = VT.getVectorElementType(); if (ElemTy != MVT::i64 && ElemTy != MVT::f64) setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom); - if (ElemTy == MVT::i8 || ElemTy == MVT::i16) - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); if (ElemTy != MVT::i32) { setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand); setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand); @@ -122,7 +97,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Legal); setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); if (VT.isInteger()) { @@ -131,6 +106,10 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); setLoadExtAction(ISD::SEXTLOAD, VT.getSimpleVT(), Expand); setLoadExtAction(ISD::ZEXTLOAD, VT.getSimpleVT(), Expand); + for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + setTruncStoreAction(VT.getSimpleVT(), + (MVT::SimpleValueType)InnerVT, Expand); } setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); @@ -177,6 +156,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) : TargetLowering(TM, createTLOF(TM)) { Subtarget = &TM.getSubtarget<ARMSubtarget>(); RegInfo = TM.getRegisterInfo(); + Itins = TM.getInstrItineraryData(); if (Subtarget->isTargetDarwin()) { // Uses VFP for Thumb libfuncs if available. @@ -260,13 +240,157 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setLibcallName(RTLIB::SRL_I128, 0); setLibcallName(RTLIB::SRA_I128, 0); - // Libcalls should use the AAPCS base standard ABI, even if hard float - // is in effect, as per the ARM RTABI specification, section 4.1.2. if (Subtarget->isAAPCS_ABI()) { - for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) { - setLibcallCallingConv(static_cast<RTLIB::Libcall>(i), - CallingConv::ARM_AAPCS); - } + // Double-precision floating-point arithmetic helper functions + // RTABI chapter 4.1.2, Table 2 + setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd"); + setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv"); + setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul"); + setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub"); + setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS); + + // Double-precision floating-point comparison helper functions + // RTABI chapter 4.1.2, Table 3 + setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq"); + setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE); + setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq"); + setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ); + setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt"); + setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE); + setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple"); + setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE); + setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge"); + setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE); + setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt"); + setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE); + setLibcallName(RTLIB::UO_F64, "__aeabi_dcmpun"); + setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE); + setLibcallName(RTLIB::O_F64, "__aeabi_dcmpun"); + setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ); + setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS); + + // Single-precision floating-point arithmetic helper functions + // RTABI chapter 4.1.2, Table 4 + setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd"); + setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv"); + setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul"); + setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub"); + setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS); + + // Single-precision floating-point comparison helper functions + // RTABI chapter 4.1.2, Table 5 + setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq"); + setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE); + setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq"); + setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ); + setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt"); + setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE); + setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple"); + setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE); + setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge"); + setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE); + setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt"); + setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE); + setLibcallName(RTLIB::UO_F32, "__aeabi_fcmpun"); + setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE); + setLibcallName(RTLIB::O_F32, "__aeabi_fcmpun"); + setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ); + setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS); + + // Floating-point to integer conversions. + // RTABI chapter 4.1.2, Table 6 + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz"); + setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz"); + setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz"); + setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz"); + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz"); + setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz"); + setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz"); + setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz"); + setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS); + + // Conversions between floating types. + // RTABI chapter 4.1.2, Table 7 + setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f"); + setLibcallName(RTLIB::FPEXT_F32_F64, "__aeabi_f2d"); + setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS); + + // Integer to floating-point conversions. + // RTABI chapter 4.1.2, Table 8 + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d"); + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d"); + setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d"); + setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d"); + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f"); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f"); + setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f"); + setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f"); + setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS); + + // Long long helper functions + // RTABI chapter 4.2, Table 9 + setLibcallName(RTLIB::MUL_I64, "__aeabi_lmul"); + setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod"); + setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod"); + setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl"); + setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr"); + setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr"); + setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS); + + // Integer division functions + // RTABI chapter 4.3.1 + setLibcallName(RTLIB::SDIV_I8, "__aeabi_idiv"); + setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv"); + setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv"); + setLibcallName(RTLIB::UDIV_I8, "__aeabi_uidiv"); + setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv"); + setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv"); + setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS); } if (Subtarget->isThumb1Only()) @@ -330,9 +454,16 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); + // Custom handling for some vector types to avoid expensive expansions + setOperationAction(ISD::SDIV, MVT::v4i16, Custom); + setOperationAction(ISD::SDIV, MVT::v8i8, Custom); + setOperationAction(ISD::UDIV, MVT::v4i16, Custom); + setOperationAction(ISD::UDIV, MVT::v8i8, Custom); setOperationAction(ISD::VSETCC, MVT::v1i64, Expand); setOperationAction(ISD::VSETCC, MVT::v2i64, Expand); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); @@ -341,6 +472,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::STORE); } computeRegisterProperties(); @@ -397,7 +532,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::BSWAP, MVT::i32, Expand); // These are expanded into libcalls. - if (!Subtarget->hasDivide()) { + if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) { // v7M has a hardware divider setOperationAction(ISD::SDIV, MVT::i32, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); @@ -423,14 +558,15 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); - // FIXME: Shouldn't need this, since no register is used, but the legalizer - // doesn't yet know how to not do that for SjLj. - setExceptionSelectorRegister(ARM::R0); + setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); + setExceptionPointerRegister(ARM::R0); + setExceptionSelectorRegister(ARM::R1); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use // the default expansion. if (Subtarget->hasDataBarrier() || - (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only())) { + (Subtarget->hasV6Ops() && !Subtarget->isThumb())) { // membarrier needs custom lowering; the rest are legal and handled // normally. setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); @@ -474,6 +610,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand); + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. if (!Subtarget->hasV6Ops()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); @@ -484,7 +622,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. - setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom); + setOperationAction(ISD::BITCAST, MVT::i64, Custom); setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); } @@ -493,6 +631,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) if (Subtarget->isTargetDarwin()) { setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_DISPATCHSETUP, MVT::Other, Custom); } setOperationAction(ISD::SETCC, MVT::i32, Expand); @@ -547,8 +686,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::MUL); - if (Subtarget->hasV6T2Ops()) + if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) setTargetDAGCombine(ISD::OR); + if (Subtarget->hasNEON()) + setTargetDAGCombine(ISD::AND); setStackPointerRegisterToSaveRestore(ARM::SP); @@ -557,16 +698,26 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) else setSchedulingPreference(Sched::Hybrid); - maxStoresPerMemcpy = 1; //// temporary - rewrite interface to use type + //// temporary - rewrite interface to use type + maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. setMinStackArgumentAlignment(4); - if (EnableARMCodePlacement) - benefitFromCodePlacementOpt = true; + benefitFromCodePlacementOpt = true; } +// FIXME: It might make sense to define the representative register class as the +// nearest super-register that has a non-null superset. For example, DPR_VFP2 is +// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, +// SPR's representative would be DPR_VFP2. This should work well if register +// pressure tracking were modified such that a register use would increment the +// pressure of the register class's representative and all of it's super +// classes' representatives transitively. We have not implemented this because +// of the difficulty prior to coalescing of modeling operand register classes +// due to the common occurence of cross class copies and subregister insertions +// and extractions. std::pair<const TargetRegisterClass*, uint8_t> ARMTargetLowering::findRepresentativeClass(EVT VT) const{ const TargetRegisterClass *RRC = 0; @@ -580,6 +731,12 @@ ARMTargetLowering::findRepresentativeClass(EVT VT) const{ case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: RRC = ARM::DPRRegisterClass; + // When NEON is used for SP, only half of the register file is available + // because operations that define both SP and DP results will be constrained + // to the VFP2 class (D0-D15). We currently model this constraint prior to + // coalescing by double-counting the SP regs. See the FIXME above. + if (Subtarget->useNEONForSinglePrecisionFP()) + Cost = 2; break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: @@ -602,6 +759,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { default: return 0; case ARMISD::Wrapper: return "ARMISD::Wrapper"; + case ARMISD::WrapperDYN: return "ARMISD::WrapperDYN"; + case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; case ARMISD::CALL: return "ARMISD::CALL"; case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; @@ -612,7 +771,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; - case ARMISD::AND: return "ARMISD::AND"; case ARMISD::CMP: return "ARMISD::CMP"; case ARMISD::CMPZ: return "ARMISD::CMPZ"; case ARMISD::CMPFP: return "ARMISD::CMPFP"; @@ -633,25 +791,33 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; - case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; - case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; + case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; + case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_DISPATCHSETUP:return "ARMISD::EH_SJLJ_DISPATCHSETUP"; case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; - + case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; case ARMISD::MEMBARRIER: return "ARMISD::MEMBARRIER"; - case ARMISD::SYNCBARRIER: return "ARMISD::SYNCBARRIER"; + case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; + + case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; case ARMISD::VCEQ: return "ARMISD::VCEQ"; + case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; case ARMISD::VCGE: return "ARMISD::VCGE"; + case ARMISD::VCGEZ: return "ARMISD::VCGEZ"; + case ARMISD::VCLEZ: return "ARMISD::VCLEZ"; case ARMISD::VCGEU: return "ARMISD::VCGEU"; case ARMISD::VCGT: return "ARMISD::VCGT"; + case ARMISD::VCGTZ: return "ARMISD::VCGTZ"; + case ARMISD::VCLTZ: return "ARMISD::VCLTZ"; case ARMISD::VCGTU: return "ARMISD::VCGTU"; case ARMISD::VTST: return "ARMISD::VTST"; @@ -693,6 +859,28 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::FMAX: return "ARMISD::FMAX"; case ARMISD::FMIN: return "ARMISD::FMIN"; case ARMISD::BFI: return "ARMISD::BFI"; + case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; + case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; + case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; + case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; + case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; + case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; + case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; + case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; + case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; + case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; + case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; + case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; + case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; + case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; + case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; + case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; + case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; + case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; + case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; + case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; + case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; + case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; } } @@ -735,6 +923,8 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { for (unsigned i = 0; i != NumVals; ++i) { EVT VT = N->getValueType(i); + if (VT == MVT::Glue || VT == MVT::Other) + continue; if (VT.isFloatingPoint() || VT.isVector()) return Sched::Latency; } @@ -746,25 +936,29 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { // is not available. const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); const TargetInstrDesc &TID = TII->get(N->getMachineOpcode()); - if (TID.mayLoad()) - return Sched::Latency; - const InstrItineraryData &Itins = getTargetMachine().getInstrItineraryData(); - if (!Itins.isEmpty() && Itins.getStageLatency(TID.getSchedClass()) > 2) + if (TID.getNumDefs() == 0) + return Sched::RegPressure; + if (!Itins->isEmpty() && + Itins->getOperandCycle(TID.getSchedClass(), 0) > 2) return Sched::Latency; + return Sched::RegPressure; } +// FIXME: Move to RegInfo unsigned ARMTargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + switch (RC->getID()) { default: return 0; case ARM::tGPRRegClassID: - return RegInfo->hasFP(MF) ? 4 : 5; + return TFI->hasFP(MF) ? 4 : 5; case ARM::GPRRegClassID: { - unsigned FP = RegInfo->hasFP(MF) ? 1 : 0; + unsigned FP = TFI->hasFP(MF) ? 1 : 0; return 10 - FP - (Subtarget->isR9Reserved() ? 1 : 0); } case ARM::SPRRegClassID: // Currently not used as 'rep' register class. @@ -829,136 +1023,6 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, #include "ARMGenCallingConv.inc" -// APCS f64 is in register pairs, possibly split to stack -static bool f64AssignAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - CCState &State, bool CanFail) { - static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; - - // Try to get the first register. - if (unsigned Reg = State.AllocateReg(RegList, 4)) - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - else { - // For the 2nd half of a v2f64, do not fail. - if (CanFail) - return false; - - // Put the whole thing on the stack. - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(8, 4), - LocVT, LocInfo)); - return true; - } - - // Try to get the second register. - if (unsigned Reg = State.AllocateReg(RegList, 4)) - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - else - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(4, 4), - LocVT, LocInfo)); - return true; -} - -static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) - return false; - if (LocVT == MVT::v2f64 && - !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) - return false; - return true; // we handled it -} - -// AAPCS f64 is in aligned register pairs -static bool f64AssignAAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - CCState &State, bool CanFail) { - static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; - static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; - static const unsigned ShadowRegList[] = { ARM::R0, ARM::R1 }; - - unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2); - if (Reg == 0) { - // For the 2nd half of a v2f64, do not just fail. - if (CanFail) - return false; - - // Put the whole thing on the stack. - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(8, 8), - LocVT, LocInfo)); - return true; - } - - unsigned i; - for (i = 0; i < 2; ++i) - if (HiRegList[i] == Reg) - break; - - unsigned T = State.AllocateReg(LoRegList[i]); - (void)T; - assert(T == LoRegList[i] && "Could not allocate register"); - - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], - LocVT, LocInfo)); - return true; -} - -static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) - return false; - if (LocVT == MVT::v2f64 && - !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) - return false; - return true; // we handled it -} - -static bool f64RetAssign(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, CCState &State) { - static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; - static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; - - unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); - if (Reg == 0) - return false; // we didn't handle it - - unsigned i; - for (i = 0; i < 2; ++i) - if (HiRegList[i] == Reg) - break; - - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], - LocVT, LocInfo)); - return true; -} - -static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) - return false; - if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) - return false; - return true; // we handled it -} - -static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, - State); -} - /// CCAssignFnForNode - Selects the correct CCAssignFn for a the /// given CallingConvention value. CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, @@ -967,23 +1031,29 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, switch (CC) { default: llvm_unreachable("Unsupported calling convention"); - case CallingConv::C: case CallingConv::Fast: + if (Subtarget->hasVFP2() && !isVarArg) { + if (!Subtarget->isAAPCS_ABI()) + return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); + // For AAPCS ABI targets, just use VFP variant of the calling convention. + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); + } + // Fallthrough + case CallingConv::C: { // Use target triple & subtarget features to do actual dispatch. - if (Subtarget->isAAPCS_ABI()) { - if (Subtarget->hasVFP2() && - FloatABIType == FloatABI::Hard && !isVarArg) - return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); - else - return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); - } else - return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + if (!Subtarget->isAAPCS_ABI()) + return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); + else if (Subtarget->hasVFP2() && + FloatABIType == FloatABI::Hard && !isVarArg) + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); + return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); + } case CallingConv::ARM_AAPCS_VFP: - return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); case CallingConv::ARM_AAPCS: - return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); + return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); case CallingConv::ARM_APCS: - return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); } } @@ -1050,7 +1120,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: - Val = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), Val); + Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); break; } @@ -1073,7 +1143,7 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile=*/false, /*AlwaysInline=*/false, - NULL, 0, NULL, 0); + MachinePointerInfo(0), MachinePointerInfo(0)); } /// LowerMemOpCallTo - Store the argument to the stack. @@ -1086,11 +1156,11 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); - if (Flags.isByVal()) { + if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - } + return DAG.getStore(Chain, dl, Arg, PtrOff, - PseudoSourceValue::getStack(), LocMemOffset, + MachinePointerInfo::getStack(LocMemOffset), false, false, 0); } @@ -1198,7 +1268,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } @@ -1289,7 +1359,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); // Create a constant pool entry for the callee address - unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); @@ -1298,13 +1368,13 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); // Create a constant pool entry for the callee address - unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(), Sym, ARMPCLabelIndex, 0); // Get the address of the callee into a register @@ -1312,7 +1382,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); } } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { @@ -1326,7 +1396,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); // tBX takes a register source operand. if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { - unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, 4); @@ -1334,13 +1404,19 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, getPointerTy(), Callee, PICLabel); - } else - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); + } else { + // On ELF targets for PIC code, direct calls should go through the PLT + unsigned OpFlags = 0; + if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_) + OpFlags = ARMII::MO_PLT; + Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); + } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { isDirect = true; bool isStub = Subtarget->isTargetDarwin() && @@ -1349,20 +1425,26 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // tBX takes a register source operand. const char *Sym = S->getSymbol(); if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { - unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, getPointerTy(), Callee, PICLabel); - } else - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy()); + } else { + unsigned OpFlags = 0; + // On ELF targets for PIC code, direct calls should go through the PLT + if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_) + OpFlags = ARMII::MO_PLT; + Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags); + } } // FIXME: handle tail calls differently. @@ -1391,7 +1473,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); if (isTailCall) return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); @@ -1421,7 +1503,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); - if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) + if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -1490,32 +1572,15 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // LR. This means if we need to reload LR, it takes an extra instructions, // which outweighs the value of the tail call; but here we don't know yet // whether LR is going to be used. Probably the right approach is to - // generate the tail call here and turn it back into CALL/RET in + // generate the tail call here and turn it back into CALL/RET in // emitEpilogue if LR is used. - if (Subtarget->isThumb1Only()) - return false; - - // For the moment, we can only do this to functions defined in this - // compilation, or to indirect calls. A Thumb B to an ARM function, - // or vice versa, is not easily fixed up in the linker unlike BL. - // (We could do this by loading the address of the callee into a register; - // that is an extra instruction over the direct call and burns a register - // as well, so is not likely to be a win.) - - // It might be safe to remove this restriction on non-Darwin. // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, // but we need to make sure there are enough registers; the only valid // registers are the 4 used for parameters. We don't currently do this // case. - if (isa<ExternalSymbolSDNode>(Callee)) - return false; - - if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { - const GlobalValue *GV = G->getGlobal(); - if (GV->isDeclaration() || GV->isWeakForLinker()) - return false; - } + if (Subtarget->isThumb1Only()) + return false; // If the calling conventions do not match, then we'd better make sure the // results are returned in the same way as what the caller expects. @@ -1583,7 +1648,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (!VA.isRegLoc()) return false; if (!ArgLocs[++i].isRegLoc()) - return false; + return false; if (RegVT == MVT::v2f64) { if (!ArgLocs[++i].isRegLoc()) return false; @@ -1643,7 +1708,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } @@ -1693,6 +1758,61 @@ ARMTargetLowering::LowerReturn(SDValue Chain, return result; } +bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + unsigned NumCopies = 0; + SDNode* Copies[2]; + SDNode *Use = *N->use_begin(); + if (Use->getOpcode() == ISD::CopyToReg) { + Copies[NumCopies++] = Use; + } else if (Use->getOpcode() == ARMISD::VMOVRRD) { + // f64 returned in a pair of GPRs. + for (SDNode::use_iterator UI = Use->use_begin(), UE = Use->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != ISD::CopyToReg) + return false; + Copies[UI.getUse().getResNo()] = *UI; + ++NumCopies; + } + } else if (Use->getOpcode() == ISD::BITCAST) { + // f32 returned in a single GPR. + if (!Use->hasNUsesOfValue(1, 0)) + return false; + Use = *Use->use_begin(); + if (Use->getOpcode() != ISD::CopyToReg || !Use->hasNUsesOfValue(1, 0)) + return false; + Copies[NumCopies++] = Use; + } else { + return false; + } + + if (NumCopies != 1 && NumCopies != 2) + return false; + + bool HasRet = false; + for (unsigned i = 0; i < NumCopies; ++i) { + SDNode *Copy = Copies[i]; + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() == ISD::CopyToReg) { + SDNode *Use = *UI; + if (Use == Copies[0] || Use == Copies[1]) + continue; + return false; + } + if (UI->getOpcode() != ARMISD::RET_FLAG) + return false; + HasRet = true; + } + } + + return HasRet; +} + // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise @@ -1732,7 +1852,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); } else { unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; - ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = new ARMConstantPoolValue(BA, ARMPCLabelIndex, ARMCP::CPBlockAddress, PCAdj); @@ -1740,7 +1860,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); if (RelocM == Reloc::Static) return Result; @@ -1757,14 +1877,14 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, - ARMCP::CPValue, PCAdj, "tlsgd", true); + ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); SDValue Chain = Argument.getValue(1); @@ -1802,16 +1922,16 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, if (GV->isDeclaration()) { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); // Initial exec model. unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, - ARMCP::CPValue, PCAdj, "gottpoff", true); + ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); Chain = Offset.getValue(1); @@ -1819,15 +1939,15 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); } else { // local exec model - ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, "tpoff"); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMCP::TPOFF); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); } @@ -1859,51 +1979,72 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, if (RelocM == Reloc::PIC_) { bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); ARMConstantPoolValue *CPV = - new ARMConstantPoolValue(GV, UseGOTOFF ? "GOTOFF" : "GOT"); + new ARMConstantPoolValue(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); SDValue Chain = Result.getValue(1); SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT); if (!UseGOTOFF) Result = DAG.getLoad(PtrVT, dl, Chain, Result, - PseudoSourceValue::getGOT(), 0, - false, false, 0); + MachinePointerInfo::getGOT(), false, false, 0); return Result; + } + + // If we have T2 ops, we can materialize the address directly via movt/movw + // pair. This is always cheaper. + if (Subtarget->useMovt()) { + ++NumMovwMovt; + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes. + return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, + DAG.getTargetGlobalAddress(GV, dl, PtrVT)); } else { - // If we have T2 ops, we can materialize the address directly via movt/movw - // pair. This is always cheaper. - if (Subtarget->useMovt()) { - return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, - DAG.getTargetGlobalAddress(GV, dl, PtrVT)); - } else { - SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); - CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, - false, false, 0); - } + SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); } } SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned ARMPCLabelIndex = 0; EVT PtrVT = getPointerTy(); DebugLoc dl = Op.getDebugLoc(); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + if (Subtarget->useMovt()) { + ++NumMovwMovt; + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes. + if (RelocM == Reloc::Static) + return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, + DAG.getTargetGlobalAddress(GV, dl, PtrVT)); + + unsigned Wrapper = (RelocM == Reloc::PIC_) + ? ARMISD::WrapperPIC : ARMISD::WrapperDYN; + SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, + DAG.getTargetGlobalAddress(GV, dl, PtrVT)); + if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) + Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(), false, false, 0); + return Result; + } + + unsigned ARMPCLabelIndex = 0; SDValue CPAddr; - if (RelocM == Reloc::Static) + if (RelocM == Reloc::Static) { CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); - else { - ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + } else { + ARMPCLabelIndex = AFI->createPICLabelUId(); unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8); ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj); @@ -1912,7 +2053,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); SDValue Chain = Result.getValue(1); @@ -1922,8 +2063,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, } if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) - Result = DAG.getLoad(PtrVT, dl, Chain, Result, - PseudoSourceValue::getGOT(), 0, + Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(), false, false, 0); return Result; @@ -1935,7 +2075,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, "GLOBAL OFFSET TABLE not implemented for non-ELF targets"); MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); EVT PtrVT = getPointerTy(); DebugLoc dl = Op.getDebugLoc(); unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; @@ -1945,13 +2085,21 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); } SDValue +ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG) + const { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other, + Op.getOperand(0), Op.getOperand(1)); +} + +SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); SDValue Val = DAG.getConstant(0, MVT::i32); @@ -1980,7 +2128,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); EVT PtrVT = getPointerTy(); DebugLoc dl = Op.getDebugLoc(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); @@ -1994,7 +2142,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); if (RelocM == Reloc::PIC_) { @@ -2009,21 +2157,55 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { DebugLoc dl = Op.getDebugLoc(); - SDValue Op5 = Op.getOperand(5); - unsigned isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue(); - // Some subtargets which have dmb and dsb instructions can handle barriers - // directly. Some ARMv6 cpus can support them with the help of mcr - // instruction. Thumb1 and pre-v6 ARM mode use a libcall instead and should - // never get here. - unsigned Opc = isDeviceBarrier ? ARMISD::SYNCBARRIER : ARMISD::MEMBARRIER; - if (Subtarget->hasDataBarrier()) - return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0)); - else { - assert(Subtarget->hasV6Ops() && !Subtarget->isThumb1Only() && + if (!Subtarget->hasDataBarrier()) { + // Some ARMv6 cpus can support data barriers with an mcr instruction. + // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get + // here. + assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); - return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0), + return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), DAG.getConstant(0, MVT::i32)); } + + SDValue Op5 = Op.getOperand(5); + bool isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue() != 0; + unsigned isLL = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + unsigned isLS = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0); + + ARM_MB::MemBOpt DMBOpt; + if (isDeviceBarrier) + DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY; + else + DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH; + return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(DMBOpt, MVT::i32)); +} + +static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + // ARM pre v5TE and Thumb1 does not have preload instructions. + if (!(Subtarget->isThumb2() || + (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) + // Just preserve the chain. + return Op.getOperand(0); + + DebugLoc dl = Op.getDebugLoc(); + unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; + if (!isRead && + (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) + // ARMv7 with MP extension has PLDW. + return Op.getOperand(0); + + if (Subtarget->isThumb()) + // Invert the bits. + isRead = ~isRead & 1; + unsigned isData = Subtarget->isThumb() ? 0 : 1; + + // Currently there is no intrinsic that matches pli. + return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), + Op.getOperand(1), DAG.getConstant(isRead, MVT::i32), + DAG.getConstant(isData, MVT::i32)); } static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { @@ -2036,8 +2218,8 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); - return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, - false, false, 0); + return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); } SDValue @@ -2054,7 +2236,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, RC = ARM::GPRRegisterClass; // Transform the arguments stored in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl); SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); SDValue ArgValue2; @@ -2065,10 +2247,10 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, // Create load node to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, - PseudoSourceValue::getFixedStack(FI), 0, + MachinePointerInfo::getFixedStack(FI), false, false, 0); } else { - Reg = MF.addLiveIn(NextVA.getLocReg(), RC); + Reg = MF.addLiveIn(NextVA.getLocReg(), RC, dl); ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); } @@ -2119,7 +2301,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, - PseudoSourceValue::getFixedStack(FI), 0, + MachinePointerInfo::getFixedStack(FI), false, false, 0); } else { ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], @@ -2149,7 +2331,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } @@ -2160,7 +2342,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: - ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); + ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); break; case CCValAssign::SExt: ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, @@ -2188,7 +2370,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, - PseudoSourceValue::getFixedStack(FI), 0, + MachinePointerInfo::getFixedStack(FI), false, false, 0)); } } @@ -2202,7 +2384,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, unsigned NumGPRs = CCInfo.getFirstUnallocated (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); - unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); + unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); unsigned VARegSize = (4 - NumGPRs) * 4; unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); unsigned ArgOffset = CCInfo.getNextStackOffset(); @@ -2214,7 +2396,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, AFI->setVarArgsFrameIndex( MFI->CreateFixedObject(VARegSaveSize, ArgOffset + VARegSaveSize - VARegSize, - true)); + false)); SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), getPointerTy()); @@ -2226,12 +2408,12 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, else RC = ARM::GPRRegisterClass; - unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC); + unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC, dl); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()), - 0, false, false, 0); + MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()), + false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getConstant(4, getPointerTy())); @@ -2320,7 +2502,7 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, break; } ARMcc = DAG.getConstant(CondCode, MVT::i32); - return DAG.getNode(CompareType, dl, MVT::Flag, LHS, RHS); + return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); } /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. @@ -2329,10 +2511,10 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, DebugLoc dl) const { SDValue Cmp; if (!isFloatingPointZero(RHS)) - Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Flag, LHS, RHS); + Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); else - Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Flag, LHS); - return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Flag, Cmp); + Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); + return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); } SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -2444,8 +2626,7 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) return DAG.getLoad(MVT::i32, Op.getDebugLoc(), - Ld->getChain(), Ld->getBasePtr(), - Ld->getSrcValue(), Ld->getSrcValueOffset(), + Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->getAlignment()); @@ -2464,7 +2645,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue Ptr = Ld->getBasePtr(); RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), Ld->getChain(), Ptr, - Ld->getSrcValue(), Ld->getSrcValueOffset(), + Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->getAlignment()); @@ -2474,7 +2655,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, PtrType, Ptr, DAG.getConstant(4, PtrType)); RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), Ld->getChain(), NewPtr, - Ld->getSrcValue(), Ld->getSrcValueOffset() + 4, + Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), NewAlign); return; @@ -2524,7 +2705,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { expandf64Toi32(RHS, DAG, RHS1, RHS2); ARMCC::CondCodes CondCode = IntCCToARMCC(CC); ARMcc = DAG.getConstant(CondCode, MVT::i32); - SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7); } @@ -2564,7 +2745,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); if (CondCode2 != ARMCC::AL) { @@ -2599,14 +2780,14 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { } if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, - PseudoSourceValue::getJumpTable(), 0, + MachinePointerInfo::getJumpTable(), false, false, 0); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); } else { Addr = DAG.getLoad(PTy, dl, Chain, Addr, - PseudoSourceValue::getJumpTable(), 0, false, false, 0); + MachinePointerInfo::getJumpTable(), false, false, 0); Chain = Addr.getValue(1); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); } @@ -2627,7 +2808,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { break; } Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0)); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op); + return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); } static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -2646,7 +2827,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { break; } - Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Op.getOperand(0)); + Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0)); return DAG.getNode(Opc, dl, VT, Op); } @@ -2657,12 +2838,46 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); - SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0); - SDValue ARMcc = DAG.getConstant(ARMCC::LT, MVT::i32); - SDValue FP0 = DAG.getConstantFP(0.0, SrcVT); - SDValue Cmp = getVFPCmp(Tmp1, FP0, DAG, dl); + bool F2IisFast = Subtarget->isCortexA9() || + Tmp0.getOpcode() == ISD::BITCAST || Tmp0.getOpcode() == ARMISD::VMOVDRR; + + // Bitcast operand 1 to i32. + if (SrcVT == MVT::f64) + Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), + &Tmp1, 1).getValue(1); + Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); + + // If float to int conversion isn't going to be super expensive, then simply + // or in the signbit. + if (F2IisFast) { + SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32); + SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32); + Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); + if (VT == MVT::f32) { + Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, + DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); + return DAG.getNode(ISD::BITCAST, dl, MVT::f32, + DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); + } + + // f64: Or the high part with signbit and then combine two parts. + Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), + &Tmp0, 1); + SDValue Lo = Tmp0.getValue(0); + SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); + Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); + return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + } + + // Remove the signbit of operand 0. + Tmp0 = DAG.getNode(ISD::FABS, dl, VT, Tmp0); + + // If operand 1 signbit is one, then negate operand 0. + SDValue ARMcc; + SDValue Cmp = getARMCmp(Tmp1, DAG.getConstant(0, MVT::i32), + ISD::SETLT, ARMcc, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMcc, CCR, Cmp); + return DAG.getNode(ARMISD::CNEG, dl, VT, Tmp0, Tmp0, ARMcc, CCR, Cmp); } SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ @@ -2678,11 +2893,11 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ SDValue Offset = DAG.getConstant(4, MVT::i32); return DAG.getLoad(VT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), - NULL, 0, false, false, 0); + MachinePointerInfo(), false, false, 0); } // Return LR, which contains the return address. Mark it an implicit live-in. - unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32), dl); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } @@ -2697,17 +2912,18 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { ? ARM::R7 : ARM::R11; SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) - FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), false, false, 0); return FrameAddr; } -/// ExpandBIT_CONVERT - If the target supports VFP, this function is called to +/// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 /// operand type is illegal (e.g., v2f32 for a target that doesn't support /// vectors), since the legalizer won't know what to do with that. -static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { +static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); DebugLoc dl = N->getDebugLoc(); SDValue Op = N->getOperand(0); @@ -2717,7 +2933,7 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && - "ExpandBIT_CONVERT called for non-i64 type"); + "ExpandBITCAST called for non-i64 type"); // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { @@ -2725,7 +2941,7 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(0, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(1, MVT::i32)); - return DAG.getNode(ISD::BIT_CONVERT, dl, DstVT, + return DAG.getNode(ISD::BITCAST, dl, DstVT, DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); } @@ -2752,7 +2968,7 @@ static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32); EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two @@ -2825,7 +3041,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, return DAG.getMergeValues(Ops, 2, dl); } -SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, +SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { // The rounding mode is in bits 23:22 of the FPSCR. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 @@ -2835,11 +3051,11 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, DAG.getConstant(Intrinsic::arm_get_fpscr, MVT::i32)); - SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, + SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, DAG.getConstant(1U << 22, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, MVT::i32)); - return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, DAG.getConstant(3, MVT::i32)); } @@ -2860,33 +3076,40 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); DebugLoc dl = N->getDebugLoc(); + if (!VT.isVector()) + return SDValue(); + // Lower vector shifts on NEON to use VSHL. - if (VT.isVector()) { - assert(ST->hasNEON() && "unexpected vector shift"); - - // Left shifts translate directly to the vshiftu intrinsic. - if (N->getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32), - N->getOperand(0), N->getOperand(1)); - - assert((N->getOpcode() == ISD::SRA || - N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); - - // NEON uses the same intrinsics for both left and right shifts. For - // right shifts, the shift amounts are negative, so negate the vector of - // shift amounts. - EVT ShiftVT = N->getOperand(1).getValueType(); - SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, - getZeroVector(ShiftVT, DAG, dl), - N->getOperand(1)); - Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? - Intrinsic::arm_neon_vshifts : - Intrinsic::arm_neon_vshiftu); + assert(ST->hasNEON() && "unexpected vector shift"); + + // Left shifts translate directly to the vshiftu intrinsic. + if (N->getOpcode() == ISD::SHL) return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(vshiftInt, MVT::i32), - N->getOperand(0), NegatedCount); - } + DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32), + N->getOperand(0), N->getOperand(1)); + + assert((N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); + + // NEON uses the same intrinsics for both left and right shifts. For + // right shifts, the shift amounts are negative, so negate the vector of + // shift amounts. + EVT ShiftVT = N->getOperand(1).getValueType(); + SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, + getZeroVector(ShiftVT, DAG, dl), + N->getOperand(1)); + Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? + Intrinsic::arm_neon_vshifts : + Intrinsic::arm_neon_vshiftu); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(vshiftInt, MVT::i32), + N->getOperand(0), NegatedCount); +} + +static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); // We can get here for a node like i32 = ISD::SHL i32, i64 if (VT != MVT::i64) @@ -2912,7 +3135,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and // captures the result into a carry flag. unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; - Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Flag), &Hi, 1); + Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1); // The low part is an ARMISD::RRX operand, which shifts the carry in. Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); @@ -2998,13 +3221,13 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { AndOp = Op1; // Ignore bitconvert. - if (AndOp.getNode() && AndOp.getOpcode() == ISD::BIT_CONVERT) + if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) AndOp = AndOp.getOperand(0); if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { Opc = ARMISD::VTST; - Op0 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(0)); - Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(1)); + Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0)); + Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1)); Invert = !Invert; } } @@ -3013,7 +3236,38 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { if (Swap) std::swap(Op0, Op1); - SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + // If one of the operands is a constant vector zero, attempt to fold the + // comparison to a specialized compare-against-zero form. + SDValue SingleOp; + if (ISD::isBuildVectorAllZeros(Op1.getNode())) + SingleOp = Op0; + else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { + if (Opc == ARMISD::VCGE) + Opc = ARMISD::VCLEZ; + else if (Opc == ARMISD::VCGT) + Opc = ARMISD::VCLTZ; + SingleOp = Op1; + } + + SDValue Result; + if (SingleOp.getNode()) { + switch (Opc) { + case ARMISD::VCEQ: + Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break; + case ARMISD::VCGE: + Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break; + case ARMISD::VCLEZ: + Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break; + case ARMISD::VCGT: + Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break; + case ARMISD::VCLTZ: + Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break; + default: + Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + } + } else { + Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + } if (Invert) Result = DAG.getNOT(dl, Result, VT); @@ -3026,7 +3280,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { /// operand (e.g., VMOV). If so, return the encoded value. static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, - EVT &VT, bool is128Bits, bool isVMOV) { + EVT &VT, bool is128Bits, NEONModImmType type) { unsigned OpCmode, Imm; // SplatBitSize is set to the smallest size that splats the vector, so a @@ -3039,7 +3293,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, switch (SplatBitSize) { case 8: - if (!isVMOV) + if (type != VMOVModImm) return SDValue(); // Any 1-byte value is OK. Op=0, Cmode=1110. assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); @@ -3096,6 +3350,9 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, break; } + // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC + if (type == OtherModImm) return SDValue(); + if ((SplatBits & ~0xffff) == 0 && ((SplatBits | SplatUndef) & 0xff) == 0xff) { // Value = 0x0000nnff: Op=x, Cmode=1100. @@ -3122,7 +3379,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, return SDValue(); case 64: { - if (!isVMOV) + if (type != VMOVModImm) return SDValue(); // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. uint64_t BitMask = 0xff; @@ -3376,8 +3633,8 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, // If this is a case we can't handle, return null and let the default // expansion code take care of it. -static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *ST) { +SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const { BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -3391,10 +3648,11 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, EVT VmovVT; SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, - DAG, VmovVT, VT.is128BitVector(), true); + DAG, VmovVT, VT.is128BitVector(), + VMOVModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } // Try an immediate VMVN. @@ -3402,10 +3660,11 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, ((1LL << SplatBitSize) - 1)); Val = isNEONModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, - DAG, VmovVT, VT.is128BitVector(), false); + DAG, VmovVT, VT.is128BitVector(), + VMVNModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } } } @@ -3439,26 +3698,25 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (EnableARMVDUPsplat) { - // Use VDUP for non-constant splats. For f32 constant splats, reduce to - // i32 and try again. - if (usesOnlyOneValue && EltSize <= 32) { - if (!isConstant) - return DAG.getNode(ARMISD::VDUP, dl, VT, Value); - if (VT.getVectorElementType().isFloatingPoint()) { - SmallVector<SDValue, 8> Ops; - for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, - Op.getOperand(i))); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &Ops[0], - NumElts); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, - LowerBUILD_VECTOR(Val, DAG, ST)); - } - SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); + // Use VDUP for non-constant splats. For f32 constant splats, reduce to + // i32 and try again. + if (usesOnlyOneValue && EltSize <= 32) { + if (!isConstant) + return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + if (VT.getVectorElementType().isFloatingPoint()) { + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, + Op.getOperand(i))); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts); + Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) - return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Val); } + SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); + if (Val.getNode()) + return DAG.getNode(ARMISD::VDUP, dl, VT, Val); } // If all elements are constants and the case above didn't get hit, fall back @@ -3467,10 +3725,11 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (isConstant) return SDValue(); - if (!EnableARMVDUPsplat) { - // Use VDUP for non-constant splats. - if (usesOnlyOneValue && EltSize <= 32) - return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. + if (NumElts >= 4) { + SDValue shuffle = ReconstructShuffle(Op, DAG); + if (shuffle != SDValue()) + return shuffle; } // Vectors with 32- or 64-bit elements can be built by directly assigning @@ -3483,14 +3742,144 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Op.getOperand(i))); + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Val); } return SDValue(); } +// Gather data to see if the operation can be modelled as a +// shuffle in combination with VEXTs. +SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + SmallVector<SDValue, 2> SourceVecs; + SmallVector<unsigned, 2> MinElts; + SmallVector<unsigned, 2> MaxElts; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + // A shuffle can only come from building a vector from various + // elements of other vectors. + return SDValue(); + } + + // Record this extraction against the appropriate vector if possible... + SDValue SourceVec = V.getOperand(0); + unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); + bool FoundSource = false; + for (unsigned j = 0; j < SourceVecs.size(); ++j) { + if (SourceVecs[j] == SourceVec) { + if (MinElts[j] > EltNo) + MinElts[j] = EltNo; + if (MaxElts[j] < EltNo) + MaxElts[j] = EltNo; + FoundSource = true; + break; + } + } + + // Or record a new source if not... + if (!FoundSource) { + SourceVecs.push_back(SourceVec); + MinElts.push_back(EltNo); + MaxElts.push_back(EltNo); + } + } + + // Currently only do something sane when at most two source vectors + // involved. + if (SourceVecs.size() > 2) + return SDValue(); + + SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; + int VEXTOffsets[2] = {0, 0}; + + // This loop extracts the usage patterns of the source vectors + // and prepares appropriate SDValues for a shuffle if possible. + for (unsigned i = 0; i < SourceVecs.size(); ++i) { + if (SourceVecs[i].getValueType() == VT) { + // No VEXT necessary + ShuffleSrcs[i] = SourceVecs[i]; + VEXTOffsets[i] = 0; + continue; + } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { + // It probably isn't worth padding out a smaller vector just to + // break it down again in a shuffle. + return SDValue(); + } + + // Since only 64-bit and 128-bit vectors are legal on ARM and + // we've eliminated the other cases... + assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts && + "unexpected vector sizes in ReconstructShuffle"); + + if (MaxElts[i] - MinElts[i] >= NumElts) { + // Span too large for a VEXT to cope + return SDValue(); + } + + if (MinElts[i] >= NumElts) { + // The extraction can just take the second half + VEXTOffsets[i] = NumElts; + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], + DAG.getIntPtrConstant(NumElts)); + } else if (MaxElts[i] < NumElts) { + // The extraction can just take the first half + VEXTOffsets[i] = 0; + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], + DAG.getIntPtrConstant(0)); + } else { + // An actual VEXT is needed + VEXTOffsets[i] = MinElts[i]; + SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], + DAG.getIntPtrConstant(0)); + SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], + DAG.getIntPtrConstant(NumElts)); + ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2, + DAG.getConstant(VEXTOffsets[i], MVT::i32)); + } + } + + SmallVector<int, 8> Mask; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Entry = Op.getOperand(i); + if (Entry.getOpcode() == ISD::UNDEF) { + Mask.push_back(-1); + continue; + } + + SDValue ExtractVec = Entry.getOperand(0); + int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i) + .getOperand(1))->getSExtValue(); + if (ExtractVec == SourceVecs[0]) { + Mask.push_back(ExtractElt - VEXTOffsets[0]); + } else { + Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); + } + } + + // Final check before we try to produce nonsense... + if (isShuffleMaskLegal(Mask, VT)) + return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], + &Mask[0]); + + return SDValue(); +} + /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values @@ -3706,8 +4095,8 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // registers are defined to use, and since i64 is not legal. EVT EltVT = EVT::getFloatingPointVT(EltSize); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V1); - V2 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V2); + V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumElts; ++i) { if (ShuffleMask[i] < 0) @@ -3719,21 +4108,26 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { MVT::i32))); } SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Val); } return SDValue(); } static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); - SDValue Vec = Op.getOperand(0); + // EXTRACT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(1); - assert(VT == MVT::i32 && - Vec.getValueType().getVectorElementType().getSizeInBits() < 32 && - "unexpected type for custom-lowering vector extract"); - return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); + if (!isa<ConstantSDNode>(Lane)) + return SDValue(); + + SDValue Vec = Op.getOperand(0); + if (Op.getValueType() == MVT::i32 && + Vec.getValueType().getVectorElementType().getSizeInBits() < 32) { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); + } + + return Op; } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { @@ -3747,25 +4141,123 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDValue Op1 = Op.getOperand(1); if (Op0.getOpcode() != ISD::UNDEF) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op0), + DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), DAG.getIntPtrConstant(0)); if (Op1.getOpcode() != ISD::UNDEF) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op1), + DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), DAG.getIntPtrConstant(1)); - return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); +} + +/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each +/// element has been zero/sign-extended, depending on the isSigned parameter, +/// from an integer type half its size. +static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, + bool isSigned) { + // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. + EVT VT = N->getValueType(0); + if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { + SDNode *BVN = N->getOperand(0).getNode(); + if (BVN->getValueType(0) != MVT::v4i32 || + BVN->getOpcode() != ISD::BUILD_VECTOR) + return false; + unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + unsigned HiElt = 1 - LoElt; + ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); + ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); + ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); + ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); + if (!Lo0 || !Hi0 || !Lo1 || !Hi1) + return false; + if (isSigned) { + if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && + Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) + return true; + } else { + if (Hi0->isNullValue() && Hi1->isNullValue()) + return true; + } + return false; + } + + if (N->getOpcode() != ISD::BUILD_VECTOR) + return false; + + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + SDNode *Elt = N->getOperand(i).getNode(); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + unsigned HalfSize = EltSize / 2; + if (isSigned) { + int64_t SExtVal = C->getSExtValue(); + if ((SExtVal >> HalfSize) != (SExtVal >> EltSize)) + return false; + } else { + if ((C->getZExtValue() >> HalfSize) != 0) + return false; + } + continue; + } + return false; + } + + return true; +} + +/// isSignExtended - Check if a node is a vector value that is sign-extended +/// or a constant BUILD_VECTOR with sign-extended elements. +static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, true)) + return true; + return false; +} + +/// isZeroExtended - Check if a node is a vector value that is zero-extended +/// or a constant BUILD_VECTOR with zero-extended elements. +static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, false)) + return true; + return false; } -/// SkipExtension - For a node that is either a SIGN_EXTEND, ZERO_EXTEND, or -/// an extending load, return the unextended value. +/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending +/// load, or BUILD_VECTOR with extended elements, return the unextended value. static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) return N->getOperand(0); - LoadSDNode *LD = cast<LoadSDNode>(N); - return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(), - LD->getBasePtr(), LD->getSrcValue(), - LD->getSrcValueOffset(), LD->isVolatile(), - LD->isNonTemporal(), LD->getAlignment()); + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(), + LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), LD->getAlignment()); + // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will + // have been legalized as a BITCAST from v4i32. + if (N->getOpcode() == ISD::BITCAST) { + SDNode *BVN = N->getOperand(0).getNode(); + assert(BVN->getOpcode() == ISD::BUILD_VECTOR && + BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); + unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32, + BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); + } + // Construct a new BUILD_VECTOR with elements truncated to half the size. + assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); + EVT VT = N->getValueType(0); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; + unsigned NumElts = VT.getVectorNumElements(); + MVT TruncVT = MVT::getIntegerVT(EltSize); + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i != NumElts; ++i) { + ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); + const APInt &CInt = C->getAPIntValue(); + Ops.push_back(DAG.getConstant(CInt.trunc(EltSize), TruncVT)); + } + return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), + MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { @@ -3776,19 +4268,16 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; - if ((N0->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N0)) && - (N1->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N1))) { + if (isSignExtended(N0, DAG) && isSignExtended(N1, DAG)) NewOpc = ARMISD::VMULLs; - } else if ((N0->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N0)) && - (N1->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N1))) { + else if (isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG)) NewOpc = ARMISD::VMULLu; - } else if (VT.getSimpleVT().SimpleTy == MVT::v2i64) { + else if (VT == MVT::v2i64) // Fall through to expand this. It is not legal. return SDValue(); - } else { + else // Other vector multiplications are legal. return Op; - } // Legalize to a VMULL instruction. DebugLoc DL = Op.getDebugLoc(); @@ -3801,6 +4290,181 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(NewOpc, DL, VT, Op0, Op1); } +static SDValue +LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) { + // Convert to float + // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); + // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); + X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); + Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); + X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); + Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); + // Get reciprocal estimate. + // float4 recip = vrecpeq_f32(yf); + Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y); + // Because char has a smaller range than uchar, we can actually get away + // without any newton steps. This requires that we use a weird bias + // of 0xb000, however (again, this has been exhaustively tested). + // float4 result = as_float4(as_int4(xf*recip) + 0xb000); + X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); + X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); + Y = DAG.getConstant(0xb000, MVT::i32); + Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y); + X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); + X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); + // Convert back to short. + X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); + X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); + return X; +} + +static SDValue +LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) { + SDValue N2; + // Convert to float. + // float4 yf = vcvt_f32_s32(vmovl_s16(y)); + // float4 xf = vcvt_f32_s32(vmovl_s16(x)); + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); + N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); + N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); + + // Use reciprocal estimate and one refinement step. + // float4 recip = vrecpeq_f32(yf); + // recip *= vrecpsq_f32(yf, recip); + N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1); + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), + N1, N2); + N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); + // Because short has a smaller range than ushort, we can actually get away + // with only a single newton step. This requires that we use a weird bias + // of 89, however (again, this has been exhaustively tested). + // float4 result = as_float4(as_int4(xf*recip) + 89); + N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); + N1 = DAG.getConstant(89, MVT::i32); + N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); + N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); + // Convert back to integer and return. + // return vmovn_s32(vcvt_s32_f32(result)); + N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); + N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); + return N0; +} + +static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + assert((VT == MVT::v4i16 || VT == MVT::v8i8) && + "unexpected type for custom-lowering ISD::SDIV"); + + DebugLoc dl = Op.getDebugLoc(); + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + SDValue N2, N3; + + if (VT == MVT::v8i8) { + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); + + N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, + DAG.getIntPtrConstant(4)); + N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, + DAG.getIntPtrConstant(4)); + N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, + DAG.getIntPtrConstant(0)); + N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, + DAG.getIntPtrConstant(0)); + + N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 + N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 + + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); + N0 = LowerCONCAT_VECTORS(N0, DAG); + + N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); + return N0; + } + return LowerSDIV_v4i16(N0, N1, dl, DAG); +} + +static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + assert((VT == MVT::v4i16 || VT == MVT::v8i8) && + "unexpected type for custom-lowering ISD::UDIV"); + + DebugLoc dl = Op.getDebugLoc(); + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + SDValue N2, N3; + + if (VT == MVT::v8i8) { + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); + N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); + + N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, + DAG.getIntPtrConstant(4)); + N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, + DAG.getIntPtrConstant(4)); + N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, + DAG.getIntPtrConstant(0)); + N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, + DAG.getIntPtrConstant(0)); + + N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 + N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 + + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); + N0 = LowerCONCAT_VECTORS(N0, DAG); + + N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, + DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32), + N0); + return N0; + } + + // v4i16 sdiv ... Convert to float. + // float4 yf = vcvt_f32_s32(vmovl_u16(y)); + // float4 xf = vcvt_f32_s32(vmovl_u16(x)); + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); + N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); + N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); + N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); + + // Use reciprocal estimate and two refinement steps. + // float4 recip = vrecpeq_f32(yf); + // recip *= vrecpsq_f32(yf, recip); + // recip *= vrecpsq_f32(yf, recip); + N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1); + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), + N1, N2); + N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), + N1, N2); + N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); + // Simply multiplying by the reciprocal estimate can leave us a few ulps + // too low, so we add 2 ulps (exhaustive testing shows that this is enough, + // and that it will never cause us to return an answer too large). + // float4 result = as_float4(as_int4(xf*recip) + 89); + N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); + N1 = DAG.getConstant(2, MVT::i32); + N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); + N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); + // Convert back to integer and return. + // return vmovn_u32(vcvt_s32_f32(result)); + N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); + N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); + return N0; +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); @@ -3816,6 +4480,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG, Subtarget); + case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: @@ -3826,9 +4491,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::EH_SJLJ_DISPATCHSETUP: return LowerEH_SJLJ_DISPATCHSETUP(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); - case ISD::BIT_CONVERT: return ExpandBIT_CONVERT(Op.getNode(), DAG); + case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); @@ -3843,6 +4509,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::UDIV: return LowerUDIV(Op, DAG); } return SDValue(); } @@ -3857,12 +4525,12 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, default: llvm_unreachable("Don't know how to custom expand this!"); break; - case ISD::BIT_CONVERT: - Res = ExpandBIT_CONVERT(N, DAG); + case ISD::BITCAST: + Res = ExpandBITCAST(N, DAG); break; case ISD::SRL: case ISD::SRA: - Res = LowerShift(N, DAG, Subtarget); + Res = Expand64BitShift(N, DAG, Subtarget); break; } if (Res.getNode()) @@ -3892,7 +4560,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); case 1: ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; - strOpc = isThumb2 ? ARM::t2LDREXB : ARM::STREXB; + strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; break; case 2: ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; @@ -4183,6 +4851,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::BCCi64: case ARM::BCCZi64: { + // If there is an unconditional branch to the other successor, remove it. + BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end()); + // Compare both parts that make up the double comparison separately for // equality. bool RHSisZero = MI->getOpcode() == ARM::BCCZi64; @@ -4341,10 +5012,6 @@ static SDValue PerformMULCombine(SDNode *N, if (Subtarget->isThumb1Only()) return SDValue(); - if (DAG.getMachineFunction(). - getFunction()->hasFnAttr(Attribute::OptimizeForSize)) - return SDValue(); - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); @@ -4389,10 +5056,67 @@ static SDValue PerformMULCombine(SDNode *N, return SDValue(); } +static SDValue PerformANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Attempt to use immediate-form VBIC + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN && + BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + EVT VbicVT; + SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VbicVT, VT.is128BitVector(), + OtherModImm); + if (Val.getNode()) { + SDValue Input = + DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); + SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); + } + } + } + + return SDValue(); +} + /// PerformORCombine - Target-specific dag combine xforms for ISD::OR static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { + // Attempt to use immediate-form VORR + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN && Subtarget->hasNEON() && + BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + EVT VorrVT; + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VorrVT, VT.is128BitVector(), + OtherModImm); + if (Val.getNode()) { + SDValue Input = + DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); + SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); + } + } + } + // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when // reasonable. @@ -4400,7 +5124,6 @@ static SDValue PerformORCombine(SDNode *N, if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) return SDValue(); - SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); DebugLoc DL = N->getDebugLoc(); // 1) or (and A, mask), val => ARMbfi A, val, mask @@ -4415,40 +5138,46 @@ static SDValue PerformORCombine(SDNode *N, if (N0.getOpcode() != ISD::AND) return SDValue(); - EVT VT = N->getValueType(0); if (VT != MVT::i32) return SDValue(); + SDValue N00 = N0.getOperand(0); // The value and the mask need to be constants so we can verify this is // actually a bitfield set. If the mask is 0xffff, we can do better // via a movt instruction, so don't use BFI in that case. - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); - if (!C) + SDValue MaskOp = N0.getOperand(1); + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp); + if (!MaskC) return SDValue(); - unsigned Mask = C->getZExtValue(); + unsigned Mask = MaskC->getZExtValue(); if (Mask == 0xffff) return SDValue(); SDValue Res; // Case (1): or (and A, mask), val => ARMbfi A, val, mask - if ((C = dyn_cast<ConstantSDNode>(N1))) { - unsigned Val = C->getZExtValue(); - if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val) + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (N1C) { + unsigned Val = N1C->getZExtValue(); + if ((Val & ~Mask) != Val) return SDValue(); - Val >>= CountTrailingZeros_32(~Mask); - Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), - DAG.getConstant(Val, MVT::i32), - DAG.getConstant(Mask, MVT::i32)); + if (ARM::isBitFieldInvertedMask(Mask)) { + Val >>= CountTrailingZeros_32(~Mask); - // Do not add new nodes to DAG combiner worklist. - DCI.CombineTo(N, Res, false); + Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, + DAG.getConstant(Val, MVT::i32), + DAG.getConstant(Mask, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + return SDValue(); + } } else if (N1.getOpcode() == ISD::AND) { // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask - C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); - if (!C) + ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); + if (!N11C) return SDValue(); - unsigned Mask2 = C->getZExtValue(); + unsigned Mask2 = N11C->getZExtValue(); if (ARM::isBitFieldInvertedMask(Mask) && ARM::isBitFieldInvertedMask(~Mask2) && @@ -4462,10 +5191,11 @@ static SDValue PerformORCombine(SDNode *N, unsigned lsb = CountTrailingZeros_32(Mask2); Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), DAG.getConstant(lsb, MVT::i32)); - Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), Res, + Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, DAG.getConstant(Mask, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); + return SDValue(); } else if (ARM::isBitFieldInvertedMask(~Mask) && ARM::isBitFieldInvertedMask(Mask2) && (CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) { @@ -4476,40 +5206,472 @@ static SDValue PerformORCombine(SDNode *N, return SDValue(); // 2b unsigned lsb = CountTrailingZeros_32(Mask); - Res = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), + Res = DAG.getNode(ISD::SRL, DL, VT, N00, DAG.getConstant(lsb, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, DAG.getConstant(Mask2, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); + return SDValue(); } } + if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && + N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) && + ARM::isBitFieldInvertedMask(~Mask)) { + // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask + // where lsb(mask) == #shamt and masked bits of B are known zero. + SDValue ShAmt = N00.getOperand(1); + unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); + unsigned LSB = CountTrailingZeros_32(Mask); + if (ShAmtC != LSB) + return SDValue(); + + Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), + DAG.getConstant(~Mask, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + } + + return SDValue(); +} + +/// PerformBFICombine - (bfi A, (and B, C1), C2) -> (bfi A, B, C2) iff +/// C1 & C2 == C1. +static SDValue PerformBFICombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() == ISD::AND) { + ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); + if (!N11C) + return SDValue(); + unsigned Mask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + unsigned Mask2 = N11C->getZExtValue(); + if ((Mask & Mask2) == Mask2) + return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0), + N->getOperand(0), N1.getOperand(0), + N->getOperand(2)); + } return SDValue(); } /// PerformVMOVRRDCombine - Target-specific dag combine xforms for /// ARMISD::VMOVRRD. static SDValue PerformVMOVRRDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // fmrrd(fmdrr x, y) -> x,y + TargetLowering::DAGCombinerInfo &DCI) { + // vmovrrd(vmovdrr x, y) -> x,y SDValue InDouble = N->getOperand(0); if (InDouble.getOpcode() == ARMISD::VMOVDRR) return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); return SDValue(); } +/// PerformVMOVDRRCombine - Target-specific dag combine xforms for +/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. +static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { + // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() == ISD::BITCAST) + Op0 = Op0.getOperand(0); + if (Op1.getOpcode() == ISD::BITCAST) + Op1 = Op1.getOperand(0); + if (Op0.getOpcode() == ARMISD::VMOVRRD && + Op0.getNode() == Op1.getNode() && + Op0.getResNo() == 0 && Op1.getResNo() == 1) + return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), + N->getValueType(0), Op0.getOperand(0)); + return SDValue(); +} + +/// PerformSTORECombine - Target-specific dag combine xforms for +/// ISD::STORE. +static SDValue PerformSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Bitcast an i64 store extracted from a vector to f64. + // Otherwise, the i64 value will be legalized to a pair of i32 values. + StoreSDNode *St = cast<StoreSDNode>(N); + SDValue StVal = St->getValue(); + if (!ISD::isNormalStore(St) || St->isVolatile() || + StVal.getValueType() != MVT::i64 || + StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + DebugLoc dl = StVal.getDebugLoc(); + SDValue IntVec = StVal.getOperand(0); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, + IntVec.getValueType().getVectorNumElements()); + SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); + SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + Vec, StVal.getOperand(1)); + dl = N->getDebugLoc(); + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(Vec.getNode()); + DCI.AddToWorklist(ExtElt.getNode()); + DCI.AddToWorklist(V.getNode()); + return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment(), + St->getTBAAInfo()); +} + +/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node +/// are normal, non-volatile loads. If so, it is profitable to bitcast an +/// i64 vector to have f64 elements, since the value can then be loaded +/// directly into a VFP register. +static bool hasNormalLoadOperand(SDNode *N) { + unsigned NumElts = N->getValueType(0).getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + SDNode *Elt = N->getOperand(i).getNode(); + if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) + return true; + } + return false; +} + +/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for +/// ISD::BUILD_VECTOR. +static SDValue PerformBUILD_VECTORCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI){ + // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): + // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value + // into a pair of GPRs, which is fine when the value is used as a scalar, + // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. + SelectionDAG &DAG = DCI.DAG; + if (N->getNumOperands() == 2) { + SDValue RV = PerformVMOVDRRCombine(N, DAG); + if (RV.getNode()) + return RV; + } + + // Load i64 elements as f64 values so that type legalization does not split + // them up into i32 values. + EVT VT = N->getValueType(0); + if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) + return SDValue(); + DebugLoc dl = N->getDebugLoc(); + SmallVector<SDValue, 8> Ops; + unsigned NumElts = VT.getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); + Ops.push_back(V); + // Make the DAGCombiner fold the bitcast. + DCI.AddToWorklist(V.getNode()); + } + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts); + return DAG.getNode(ISD::BITCAST, dl, VT, BV); +} + +/// PerformInsertEltCombine - Target-specific dag combine xforms for +/// ISD::INSERT_VECTOR_ELT. +static SDValue PerformInsertEltCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Bitcast an i64 load inserted into a vector to f64. + // Otherwise, the i64 value will be legalized to a pair of i32 values. + EVT VT = N->getValueType(0); + SDNode *Elt = N->getOperand(1).getNode(); + if (VT.getVectorElementType() != MVT::i64 || + !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + DebugLoc dl = N->getDebugLoc(); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, + VT.getVectorNumElements()); + SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(Vec.getNode()); + DCI.AddToWorklist(V.getNode()); + SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, + Vec, V, N->getOperand(2)); + return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); +} + +/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for +/// ISD::VECTOR_SHUFFLE. +static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { + // The LLVM shufflevector instruction does not require the shuffle mask + // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does + // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the + // operands do not match the mask length, they are extended by concatenating + // them with undef vectors. That is probably the right thing for other + // targets, but for NEON it is better to concatenate two double-register + // size vector operands into a single quad-register size vector. Do that + // transformation here: + // shuffle(concat(v1, undef), concat(v2, undef)) -> + // shuffle(concat(v1, v2), undef) + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() != ISD::CONCAT_VECTORS || + Op1.getOpcode() != ISD::CONCAT_VECTORS || + Op0.getNumOperands() != 2 || + Op1.getNumOperands() != 2) + return SDValue(); + SDValue Concat0Op1 = Op0.getOperand(1); + SDValue Concat1Op1 = Op1.getOperand(1); + if (Concat0Op1.getOpcode() != ISD::UNDEF || + Concat1Op1.getOpcode() != ISD::UNDEF) + return SDValue(); + // Skip the transformation if any of the types are illegal. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = N->getValueType(0); + if (!TLI.isTypeLegal(VT) || + !TLI.isTypeLegal(Concat0Op1.getValueType()) || + !TLI.isTypeLegal(Concat1Op1.getValueType())) + return SDValue(); + + SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT, + Op0.getOperand(0), Op1.getOperand(0)); + // Translate the shuffle mask. + SmallVector<int, 16> NewMask; + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfElts = NumElts/2; + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); + for (unsigned n = 0; n < NumElts; ++n) { + int MaskElt = SVN->getMaskElt(n); + int NewElt = -1; + if (MaskElt < (int)HalfElts) + NewElt = MaskElt; + else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) + NewElt = HalfElts + MaskElt - NumElts; + NewMask.push_back(NewElt); + } + return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat, + DAG.getUNDEF(VT), NewMask.data()); +} + +/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and +/// NEON load/store intrinsics to merge base address updates. +static SDValue CombineBaseUpdate(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || + N->getOpcode() == ISD::INTRINSIC_W_CHAIN); + unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); + SDValue Addr = N->getOperand(AddrOpIdx); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. + if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + continue; + + // Find the new opcode for the updating load/store. + bool isLoad = true; + bool isLaneOp = false; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + if (isIntrinsic) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: assert(0 && "unexpected intrinsic for Neon base update"); + case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; break; + case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; + NumVecs = 2; break; + case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; + NumVecs = 3; break; + case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; + NumVecs = 4; break; + case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; + NumVecs = 2; isLaneOp = true; break; + case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; + NumVecs = 3; isLaneOp = true; break; + case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; + NumVecs = 4; isLaneOp = true; break; + case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLoad = false; break; + case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; + NumVecs = 2; isLoad = false; break; + case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; + NumVecs = 3; isLoad = false; break; + case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; + NumVecs = 4; isLoad = false; break; + case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; + NumVecs = 2; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; + NumVecs = 3; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; + NumVecs = 4; isLoad = false; isLaneOp = true; break; + } + } else { + isLaneOp = true; + switch (N->getOpcode()) { + default: assert(0 && "unexpected opcode for Neon base update"); + case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; + case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; + case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; + } + } + + // Find the size of memory referenced by the load/store. + EVT VecTy; + if (isLoad) + VecTy = N->getValueType(0); + else + VecTy = N->getOperand(AddrOpIdx+1).getValueType(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (isLaneOp) + NumBytes /= VecTy.getVectorNumElements(); + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { + uint64_t IncVal = CInc->getZExtValue(); + if (IncVal != NumBytes) + continue; + } else if (NumBytes >= 3 * 16) { + // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two + // separate instructions that make it harder to use a non-constant update. + continue; + } + + // Create the new updating load/store node. + EVT Tys[6]; + unsigned NumResultVecs = (isLoad ? NumVecs : 0); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i32; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2); + SmallVector<SDValue, 8> Ops; + Ops.push_back(N->getOperand(0)); // incoming chain + Ops.push_back(N->getOperand(AddrOpIdx)); + Ops.push_back(Inc); + for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { + Ops.push_back(N->getOperand(i)); + } + MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys, + Ops.data(), Ops.size(), + MemInt->getMemoryVT(), + MemInt->getMemOperand()); + + // Update the uses. + std::vector<SDValue> NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) { + NewResults.push_back(SDValue(UpdN.getNode(), i)); + } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + return SDValue(); +} + +/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a +/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic +/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and +/// return true. +static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + // vldN-dup instructions only support 64-bit vectors for N > 1. + if (!VT.is64BitVector()) + return false; + + // Check if the VDUPLANE operand is a vldN-dup intrinsic. + SDNode *VLD = N->getOperand(0).getNode(); + if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return false; + unsigned NumVecs = 0; + unsigned NewOpc = 0; + unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); + if (IntNo == Intrinsic::arm_neon_vld2lane) { + NumVecs = 2; + NewOpc = ARMISD::VLD2DUP; + } else if (IntNo == Intrinsic::arm_neon_vld3lane) { + NumVecs = 3; + NewOpc = ARMISD::VLD3DUP; + } else if (IntNo == Intrinsic::arm_neon_vld4lane) { + NumVecs = 4; + NewOpc = ARMISD::VLD4DUP; + } else { + return false; + } + + // First check that all the vldN-lane uses are VDUPLANEs and that the lane + // numbers match the load. + unsigned VLDLaneNo = + cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + // Ignore uses of the chain result. + if (UI.getUse().getResNo() == NumVecs) + continue; + SDNode *User = *UI; + if (User->getOpcode() != ARMISD::VDUPLANE || + VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) + return false; + } + + // Create the vldN-dup node. + EVT Tys[5]; + unsigned n; + for (n = 0; n < NumVecs; ++n) + Tys[n] = VT; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1); + SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; + MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); + SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys, + Ops, 2, VLDMemInt->getMemoryVT(), + VLDMemInt->getMemOperand()); + + // Update the uses. + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + unsigned ResNo = UI.getUse().getResNo(); + // Ignore uses of the chain result. + if (ResNo == NumVecs) + continue; + SDNode *User = *UI; + DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); + } + + // Now the vldN-lane intrinsic is dead except for its chain result. + // Update uses of the chain. + std::vector<SDValue> VLDDupResults; + for (unsigned n = 0; n < NumVecs; ++n) + VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); + VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); + DCI.CombineTo(VLD, VLDDupResults); + + return true; +} + /// PerformVDUPLANECombine - Target-specific dag combine xforms for /// ARMISD::VDUPLANE. static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is - // redundant. SDValue Op = N->getOperand(0); - EVT VT = N->getValueType(0); - // Ignore bit_converts. - while (Op.getOpcode() == ISD::BIT_CONVERT) + // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses + // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. + if (CombineVLDDUP(N, DCI)) + return SDValue(N, 0); + + // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is + // redundant. Ignore bit_converts for now; element sizes are checked below. + while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) return SDValue(); @@ -4521,11 +5683,11 @@ static SDValue PerformVDUPLANECombine(SDNode *N, unsigned EltBits; if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) EltSize = 8; + EVT VT = N->getValueType(0); if (EltSize > VT.getVectorElementType().getSizeInBits()) return SDValue(); - SDValue Res = DCI.DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); - return DCI.CombineTo(N, Res, false); + return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); } /// getVShiftImm - Check if this is a valid build_vector for the immediate @@ -4533,7 +5695,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N, /// build_vector must have the same constant integer value. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { // Ignore bit_converts. - while (Op.getOpcode() == ISD::BIT_CONVERT) + while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); APInt SplatBits, SplatUndef; @@ -4747,7 +5909,8 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); // Nothing to be done for scalar shifts. - if (! VT.isVector()) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!VT.isVector() || !TLI.isTypeLegal(VT)) return SDValue(); assert(ST->hasNEON() && "unexpected vector shift"); @@ -4793,7 +5956,8 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, if (VT == MVT::i32 && (EltVT == MVT::i8 || EltVT == MVT::i16) && - TLI.isTypeLegal(Vec.getValueType())) { + TLI.isTypeLegal(Vec.getValueType()) && + isa<ConstantSDNode>(Lane)) { unsigned Opc = 0; switch (N->getOpcode()) { @@ -4906,7 +6070,14 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SUB: return PerformSUBCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); case ISD::OR: return PerformORCombine(N, DCI, Subtarget); + case ISD::AND: return PerformANDCombine(N, DCI); + case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI); + case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); + case ISD::STORE: return PerformSTORECombine(N, DCI); + case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI); + case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); + case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: @@ -4916,20 +6087,42 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); + case ARMISD::VLD2DUP: + case ARMISD::VLD3DUP: + case ARMISD::VLD4DUP: + return CombineBaseUpdate(N, DCI); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: + return CombineBaseUpdate(N, DCI); + default: break; + } + break; } return SDValue(); } -bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { - if (!Subtarget->hasV6Ops()) - // Pre-v6 does not support unaligned mem access. - return false; +bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, + EVT VT) const { + return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); +} - // v6+ may or may not support unaligned mem access depending on the system - // configuration. - // FIXME: This is pretty conservative. Should we provide cmdline option to - // control the behaviour? - if (!Subtarget->isTargetDarwin()) +bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { + if (!Subtarget->allowsUnalignedMem()) return false; switch (VT.getSimpleVT().SimpleTy) { @@ -5143,7 +6336,7 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { if (!Subtarget->isThumb()) return ARM_AM::getSOImmVal(Imm) != -1; if (Subtarget->isThumb2()) - return ARM_AM::getT2SOImmVal(Imm) != -1; + return ARM_AM::getT2SOImmVal(Imm) != -1; return Imm >= 0 && Imm <= 255; } @@ -5348,6 +6541,37 @@ void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, // ARM Inline Assembly Support //===----------------------------------------------------------------------===// +bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { + // Looking for "rev" which is V6+. + if (!Subtarget->hasV6Ops()) + return false; + + InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + std::string AsmStr = IA->getAsmString(); + SmallVector<StringRef, 4> AsmPieces; + SplitString(AsmStr, AsmPieces, ";\n"); + + switch (AsmPieces.size()) { + default: return false; + case 1: + AsmStr = AsmPieces[0]; + AsmPieces.clear(); + SplitString(AsmStr, AsmPieces, " \t,"); + + // rev $0, $1 + if (AsmPieces.size() == 3 && + AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && + IA->getConstraintString().compare(0, 4, "=l,l") == 0) { + const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (Ty && Ty->getBitWidth() == 32) + return IntrinsicLowering::LowerToByteSwap(CI); + } + break; + } + + return false; +} + /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. ARMTargetLowering::ConstraintType @@ -5362,6 +6586,40 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const { return TargetLowering::getConstraintType(Constraint); } +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +ARMTargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (CallOperandVal == NULL) + return CW_Default; + const Type *type = CallOperandVal->getType(); + // Look at the constraint type. + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'l': + if (type->isIntegerTy()) { + if (Subtarget->isThumb()) + weight = CW_SpecificReg; + else + weight = CW_Register; + } + break; + case 'w': + if (type->isFloatingPointTy()) + weight = CW_Register; + break; + } + return weight; +} + std::pair<unsigned, const TargetRegisterClass*> ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const { @@ -5664,3 +6922,63 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return ARM::getVFPf64Imm(Imm) != -1; return false; } + +/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as +/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment +/// specified in the intrinsic calls. +bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + uint64_t NumElts = getTargetData()->getTypeAllocSize(I.getType()) / 8; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + const Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + default: + break; + } + + return false; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h index ba9ea7f..dc400c4 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -34,6 +34,10 @@ namespace llvm { Wrapper, // Wrapper - A wrapper node for TargetConstantPool, // TargetExternalSymbol, and TargetGlobalAddress. + WrapperDYN, // WrapperDYN - A wrapper node for TargetGlobalAddress in + // DYN mode. + WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in + // PIC mode. WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable CALL, // Function call. @@ -47,8 +51,6 @@ namespace llvm { PIC_ADD, // Add with a PC operand and a PIC label. - AND, // ARM "and" instruction that sets the 's' flag in CPSR. - CMP, // ARM compare instructions. CMPZ, // ARM compare that sets only Z flag. CMPFP, // ARM VFP compare instruction, sets FPSCR. @@ -73,8 +75,9 @@ namespace llvm { VMOVRRD, // double to two gprs. VMOVDRR, // Two gprs to double. - EH_SJLJ_SETJMP, // SjLj exception handling setjmp. - EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + EH_SJLJ_SETJMP, // SjLj exception handling setjmp. + EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + EH_SJLJ_DISPATCHSETUP, // SjLj exception handling dispatch setup. TC_RETURN, // Tail call return pseudo. @@ -82,13 +85,20 @@ namespace llvm { DYN_ALLOC, // Dynamic allocation on the stack. - MEMBARRIER, // Memory barrier - SYNCBARRIER, // Memory sync barrier + MEMBARRIER, // Memory barrier (DMB) + MEMBARRIER_MCR, // Memory barrier (MCR) + + PRELOAD, // Preload VCEQ, // Vector compare equal. + VCEQZ, // Vector compare equal to zero. VCGE, // Vector compare greater than or equal. + VCGEZ, // Vector compare greater than or equal to zero. + VCLEZ, // Vector compare less than or equal to zero. VCGEU, // Vector compare unsigned greater than or equal. VCGT, // Vector compare greater than. + VCGTZ, // Vector compare greater than zero. + VCLTZ, // Vector compare less than zero. VCGTU, // Vector compare unsigned greater than. VTST, // Vector test bits. @@ -161,7 +171,38 @@ namespace llvm { FMIN, // Bit-field insert - BFI + BFI, + + // Vector OR with immediate + VORRIMM, + // Vector AND with NOT of immediate + VBICIMM, + + // Vector load N-element structure to all lanes: + VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, + VLD3DUP, + VLD4DUP, + + // NEON loads with post-increment base updates: + VLD1_UPD, + VLD2_UPD, + VLD3_UPD, + VLD4_UPD, + VLD2LN_UPD, + VLD3LN_UPD, + VLD4LN_UPD, + VLD2DUP_UPD, + VLD3DUP_UPD, + VLD4DUP_UPD, + + // NEON stores with post-increment base updates: + VST1_UPD, + VST2_UPD, + VST3_UPD, + VST4_UPD, + VST2LN_UPD, + VST3LN_UPD, + VST4LN_UPD }; } @@ -193,14 +234,16 @@ namespace llvm { virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, SelectionDAG &DAG) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - virtual const char *getTargetNodeName(unsigned Opcode) const; virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const; + /// allowsUnalignedMemoryAccesses - Returns true if the target allows /// unaligned memory accesses. of the specified type. /// FIXME: Add getOptimalMemOpType to implement memcpy with NEON? @@ -241,7 +284,15 @@ namespace llvm { unsigned Depth) const; + virtual bool ExpandInlineAsm(CallInst *CI) const; + ConstraintType getConstraintType(const std::string &Constraint) const; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const; + std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; @@ -290,6 +341,9 @@ namespace llvm { /// materialize the FP immediate as a load from a constant pool. virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const; protected: std::pair<const TargetRegisterClass*, uint8_t> findRepresentativeClass(EVT VT) const; @@ -301,6 +355,8 @@ namespace llvm { const TargetRegisterInfo *RegInfo; + const InstrItineraryData *Itins; + /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created. /// unsigned ARMPCLabelIndex; @@ -329,6 +385,7 @@ namespace llvm { ISD::ArgFlagsTy Flags) const; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; @@ -350,6 +407,10 @@ namespace llvm { SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const; + + SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, @@ -393,6 +454,8 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; + virtual bool isUsedByReturnOnly(SDNode *N) const; + SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const; SDValue getVFPCmp(SDValue LHS, SDValue RHS, @@ -410,6 +473,13 @@ namespace llvm { }; + enum NEONModImmType { + VMOVModImm, + VMVNModImm, + OtherModImm + }; + + namespace ARM { FastISel *createFastISel(FunctionLoweringInfo &funcInfo); } diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td index 113cfff..765cba4 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -1,4 +1,4 @@ -//===- ARMInstrFormats.td - ARM Instruction Formats --*- tablegen -*---------=// +//===- ARMInstrFormats.td - ARM Instruction Formats ----------*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -71,7 +71,7 @@ def NVTBLFrm : Format<41>; // Misc flags. -// the instruction has a Rn register operand. +// The instruction has an Rn register operand. // UnaryDP - Indicates this is a unary data processing instruction, i.e. // it doesn't have a Rn operand. class UnaryDP { bit isUnaryDataProc = 1; } @@ -84,9 +84,10 @@ class Xform16Bit { bit canXformTo16Bit = 1; } // ARM Instruction flags. These need to match ARMBaseInstrInfo.h. // +// FIXME: Once the JIT is MC-ized, these can go away. // Addressing mode. -class AddrMode<bits<4> val> { - bits<4> Value = val; +class AddrMode<bits<5> val> { + bits<5> Value = val; } def AddrModeNone : AddrMode<0>; def AddrMode1 : AddrMode<1>; @@ -104,6 +105,7 @@ def AddrModeT2_i8 : AddrMode<12>; def AddrModeT2_so : AddrMode<13>; def AddrModeT2_pc : AddrMode<14>; def AddrModeT2_i8s4 : AddrMode<15>; +def AddrMode_i12 : AddrMode<16>; // Instruction size. class SizeFlagVal<bits<3> val> { @@ -134,7 +136,6 @@ def NeonDomain : Domain<2>; // Instructions in Neon domain only def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains //===----------------------------------------------------------------------===// - // ARM special operands. // @@ -143,6 +144,39 @@ def CondCodeOperand : AsmOperandClass { let SuperClasses = []; } +def CCOutOperand : AsmOperandClass { + let Name = "CCOut"; + let SuperClasses = []; +} + +def MemBarrierOptOperand : AsmOperandClass { + let Name = "MemBarrierOpt"; + let SuperClasses = []; + let ParserMethod = "tryParseMemBarrierOptOperand"; +} + +def ProcIFlagsOperand : AsmOperandClass { + let Name = "ProcIFlags"; + let SuperClasses = []; + let ParserMethod = "tryParseProcIFlagsOperand"; +} + +def MSRMaskOperand : AsmOperandClass { + let Name = "MSRMask"; + let SuperClasses = []; + let ParserMethod = "tryParseMSRMaskOperand"; +} + +// ARM imod and iflag operands, used only by the CPS instruction. +def imod_op : Operand<i32> { + let PrintMethod = "printCPSIMod"; +} + +def iflags_op : Operand<i32> { + let PrintMethod = "printCPSIFlag"; + let ParserMatchClass = ProcIFlagsOperand; +} + // ARM Predicate operand. Default to 14 = always (AL). Second part is CC // register whose default is 0 (no register). def pred : PredicateOperand<OtherVT, (ops i32imm, CCR), @@ -153,16 +187,23 @@ def pred : PredicateOperand<OtherVT, (ops i32imm, CCR), // Conditional code result for instructions whose 's' bit is set, e.g. subs. def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> { + let EncoderMethod = "getCCOutOpValue"; let PrintMethod = "printSBitModifierOperand"; + let ParserMatchClass = CCOutOperand; } // Same as cc_out except it defaults to setting CPSR. def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> { + let EncoderMethod = "getCCOutOpValue"; let PrintMethod = "printSBitModifierOperand"; + let ParserMatchClass = CCOutOperand; } // ARM special operands for disassembly only. // +def setend_op : Operand<i32> { + let PrintMethod = "printSetendOperand"; +} def cps_opt : Operand<i32> { let PrintMethod = "printCPSOptionOperand"; @@ -170,6 +211,7 @@ def cps_opt : Operand<i32> { def msr_mask : Operand<i32> { let PrintMethod = "printMSRMaskOperand"; + let ParserMatchClass = MSRMaskOperand; } // A8.6.117, A8.6.118. Different instructions are generated for #0 and #-0. @@ -179,7 +221,6 @@ def neg_zero : Operand<i32> { } //===----------------------------------------------------------------------===// - // ARM Instruction templates. // @@ -198,14 +239,17 @@ class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im, bit isUnaryDataProc = 0; bit canXformTo16Bit = 0; + // If this is a pseudo instruction, mark it isCodeGenOnly. + let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); + // The layout of TSFlags should be kept in sync with ARMBaseInstrInfo.h. - let TSFlags{3-0} = AM.Value; - let TSFlags{6-4} = SZ.Value; - let TSFlags{8-7} = IndexModeBits; - let TSFlags{14-9} = Form; - let TSFlags{15} = isUnaryDataProc; - let TSFlags{16} = canXformTo16Bit; - let TSFlags{18-17} = D.Value; + let TSFlags{4-0} = AM.Value; + let TSFlags{7-5} = SZ.Value; + let TSFlags{9-8} = IndexModeBits; + let TSFlags{15-10} = Form; + let TSFlags{16} = isUnaryDataProc; + let TSFlags{17} = canXformTo16Bit; + let TSFlags{19-18} = D.Value; let Constraints = cstr; let Itinerary = itin; @@ -225,25 +269,51 @@ class InstThumb<AddrMode am, SizeFlagVal sz, IndexMode im, Format f, Domain d, string cstr, InstrItinClass itin> : InstTemplate<am, sz, im, f, d, cstr, itin>; -class PseudoInst<dag oops, dag iops, InstrItinClass itin, - string asm, list<dag> pattern> +class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern> + // FIXME: This really should derive from InstTemplate instead, as pseudos + // don't need encoding information. TableGen doesn't like that + // currently. Need to figure out why and fix it. : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, GenericDomain, "", itin> { let OutOperandList = oops; let InOperandList = iops; - let AsmString = asm; let Pattern = pattern; } +// PseudoInst that's ARM-mode only. +class ARMPseudoInst<dag oops, dag iops, SizeFlagVal sz, InstrItinClass itin, + list<dag> pattern> + : PseudoInst<oops, iops, itin, pattern> { + let SZ = sz; + list<Predicate> Predicates = [IsARM]; +} + +// PseudoInst that's Thumb-mode only. +class tPseudoInst<dag oops, dag iops, SizeFlagVal sz, InstrItinClass itin, + list<dag> pattern> + : PseudoInst<oops, iops, itin, pattern> { + let SZ = sz; + list<Predicate> Predicates = [IsThumb]; +} + +// PseudoInst that's Thumb2-mode only. +class t2PseudoInst<dag oops, dag iops, SizeFlagVal sz, InstrItinClass itin, + list<dag> pattern> + : PseudoInst<oops, iops, itin, pattern> { + let SZ = sz; + list<Predicate> Predicates = [IsThumb2]; +} // Almost all ARM instructions are predicable. class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, IndexMode im, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + bits<4> p; + let Inst{31-28} = p; let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); - let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let AsmString = !strconcat(opc, "${p}", asm); let Pattern = pattern; list<Predicate> Predicates = [IsARM]; } @@ -270,9 +340,14 @@ class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, string opc, string asm, string cstr, list<dag> pattern> : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + bits<4> p; // Predicate operand + bits<1> s; // condition-code set flag ('1' if the insn should set the flags) + let Inst{31-28} = p; + let Inst{20} = s; + let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p, cc_out:$s)); - let AsmString = !strconcat(opc, !strconcat("${p}${s}", asm)); + let AsmString = !strconcat(opc, "${s}${p}", asm); let Pattern = pattern; list<Predicate> Predicates = [IsARM]; } @@ -319,10 +394,6 @@ class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, asm, "", pattern> { let Inst{27-24} = opcod; } -class ABXIx2<dag oops, dag iops, InstrItinClass itin, - string asm, list<dag> pattern> - : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, Pseudo, itin, - asm, "", pattern>; // BR_JT instructions class JTI<dag oops, dag iops, InstrItinClass itin, @@ -335,19 +406,42 @@ class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin, opc, asm, "", pattern> { + bits<4> Rt; + bits<4> Rn; let Inst{27-23} = 0b00011; let Inst{22-21} = opcod; let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = Rt; let Inst{11-0} = 0b111110011111; } class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin, opc, asm, "", pattern> { + bits<4> Rd; + bits<4> Rt; + bits<4> Rn; let Inst{27-23} = 0b00011; let Inst{22-21} = opcod; let Inst{20} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; let Inst{11-4} = 0b11111001; + let Inst{3-0} = Rt; +} +class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern> + : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, [$Rn]", pattern> { + bits<4> Rt; + bits<4> Rt2; + bits<4> Rn; + let Inst{27-23} = 0b00010; + let Inst{22} = b; + let Inst{21-20} = 0b00; + let Inst{19-16} = Rn; + let Inst{15-12} = Rt; + let Inst{11-4} = 0b00001001; + let Inst{3-0} = Rt2; } // addrmode1 instructions @@ -372,387 +466,125 @@ class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24-21} = opcod; let Inst{27-26} = 0b00; } -class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern>; - - -// addrmode2 loads and stores -class AI2<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern> { - let Inst{27-26} = 0b01; -} // loads -class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern> { - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 0; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} -class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, - string asm, list<dag> pattern> - : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, - asm, "", pattern> { - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 0; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} -class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern> { - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 1; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} -class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, - string asm, list<dag> pattern> - : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, - asm, "", pattern> { - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 1; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} - -// stores -class AI2stw<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern> { - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 0; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} -class AXI2stw<dag oops, dag iops, Format f, InstrItinClass itin, - string asm, list<dag> pattern> - : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, - asm, "", pattern> { - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 0; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} -class AI2stb<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern> { - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 1; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} -class AXI2stb<dag oops, dag iops, Format f, InstrItinClass itin, - string asm, list<dag> pattern> - : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, - asm, "", pattern> { - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 1; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} -// Pre-indexed loads -class AI2ldwpr<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, - opc, asm, cstr, pattern> { - let Inst{20} = 1; // L bit - let Inst{21} = 1; // W bit - let Inst{22} = 0; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} -class AI2ldbpr<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, - opc, asm, cstr, pattern> { - let Inst{20} = 1; // L bit - let Inst{21} = 1; // W bit - let Inst{22} = 1; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} - -// Pre-indexed stores -class AI2stwpr<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, - opc, asm, cstr, pattern> { - let Inst{20} = 0; // L bit - let Inst{21} = 1; // W bit - let Inst{22} = 0; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} -class AI2stbpr<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, +// LDR/LDRB/STR/STRB/... +class AI2ldst<bits<3> op, bit isLd, bit isByte, dag oops, dag iops, AddrMode am, + Format f, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : I<oops, iops, am, Size4Bytes, IndexModeNone, f, itin, opc, asm, + "", pattern> { + let Inst{27-25} = op; + let Inst{24} = 1; // 24 == P + // 23 == U + let Inst{22} = isByte; + let Inst{21} = 0; // 21 == W + let Inst{20} = isLd; +} +// Indexed load/stores +class AI2ldstidx<bit isLd, bit isByte, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, im, f, itin, opc, asm, cstr, pattern> { - let Inst{20} = 0; // L bit - let Inst{21} = 1; // W bit - let Inst{22} = 1; // B bit - let Inst{24} = 1; // P bit - let Inst{27-26} = 0b01; -} - -// Post-indexed loads -class AI2ldwpo<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, - opc, asm, cstr,pattern> { - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 0; // B bit - let Inst{24} = 0; // P bit - let Inst{27-26} = 0b01; -} -class AI2ldbpo<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, - opc, asm, cstr,pattern> { - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 1; // B bit - let Inst{24} = 0; // P bit - let Inst{27-26} = 0b01; -} - -// Post-indexed stores -class AI2stwpo<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, - opc, asm, cstr,pattern> { - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 0; // B bit - let Inst{24} = 0; // P bit - let Inst{27-26} = 0b01; -} -class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, - opc, asm, cstr,pattern> { - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 1; // B bit - let Inst{24} = 0; // P bit + bits<4> Rt; let Inst{27-26} = 0b01; + let Inst{24} = isPre; // P bit + let Inst{22} = isByte; // B bit + let Inst{21} = isPre; // W bit + let Inst{20} = isLd; // L bit + let Inst{15-12} = Rt; +} +class AI2stridx<bit isByte, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr, + pattern> { + // AM2 store w/ two operands: (GPR, am2offset) + // {13} 1 == Rm, 0 == imm12 + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> Rn; + let Inst{25} = offset{13}; + let Inst{23} = offset{12}; + let Inst{19-16} = Rn; + let Inst{11-0} = offset{11-0}; } // addrmode3 instructions -class AI3<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern>; -class AXI3<dag oops, dag iops, Format f, InstrItinClass itin, - string asm, list<dag> pattern> - : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, - asm, "", pattern>; - -// loads -class AI3ldh<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 0; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 1; // P bit - let Inst{27-25} = 0b000; -} -class AXI3ldh<dag oops, dag iops, Format f, InstrItinClass itin, - string asm, list<dag> pattern> - : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, - asm, "", pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 0; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 1; // P bit -} -class AI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> +class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f, + InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, opc, asm, "", pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 1; // P bit + bits<14> addr; + bits<4> Rt; let Inst{27-25} = 0b000; -} -class AXI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin, - string asm, list<dag> pattern> - : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, - asm, "", pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 1; // P bit -} -class AI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern> { - let Inst{4} = 1; - let Inst{5} = 0; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 1; // P bit + let Inst{24} = 1; // P bit + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{21} = 0; // W bit + let Inst{20} = op20; // L bit + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Rt; // Rt + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{7-4} = op; + let Inst{3-0} = addr{3-0}; // imm3_0/Rm +} + +class AI3ldstidx<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, im, f, itin, + opc, asm, cstr, pattern> { + bits<4> Rt; let Inst{27-25} = 0b000; -} -class AXI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin, - string asm, list<dag> pattern> - : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, - asm, "", pattern> { - let Inst{4} = 1; - let Inst{5} = 0; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 1; // P bit -} -class AI3ldd<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern> { - let Inst{4} = 1; - let Inst{5} = 0; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 1; // P bit + let Inst{24} = isPre; // P bit + let Inst{21} = isPre; // W bit + let Inst{20} = op20; // L bit + let Inst{15-12} = Rt; // Rt + let Inst{7-4} = op; +} +class AI3stridx<bits<4> op, bit isByte, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr, + pattern> { + // AM3 store w/ two operands: (GPR, am3offset) + bits<14> offset; + bits<4> Rt; + bits<4> Rn; let Inst{27-25} = 0b000; + let Inst{23} = offset{8}; + let Inst{22} = offset{9}; + let Inst{19-16} = Rn; + let Inst{15-12} = Rt; // Rt + let Inst{11-8} = offset{7-4}; // imm7_4/zero + let Inst{7-4} = op; + let Inst{3-0} = offset{3-0}; // imm3_0/Rm } // stores -class AI3sth<dag oops, dag iops, Format f, InstrItinClass itin, +class AI3str<bits<4> op, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, opc, asm, "", pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 0; // S bit - let Inst{7} = 1; - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 1; // P bit - let Inst{27-25} = 0b000; -} -class AXI3sth<dag oops, dag iops, Format f, InstrItinClass itin, - string asm, list<dag> pattern> - : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, - asm, "", pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 0; // S bit - let Inst{7} = 1; - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 1; // P bit -} -class AI3std<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 1; // P bit + bits<14> addr; + bits<4> Rt; let Inst{27-25} = 0b000; + let Inst{24} = 1; // P bit + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{21} = 0; // W bit + let Inst{20} = 0; // L bit + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Rt; // Rt + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{7-4} = op; + let Inst{3-0} = addr{3-0}; // imm3_0/Rm } -// Pre-indexed loads -class AI3ldhpr<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, - opc, asm, cstr, pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 0; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 1; // W bit - let Inst{24} = 1; // P bit - let Inst{27-25} = 0b000; -} -class AI3ldshpr<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, - opc, asm, cstr, pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 1; // W bit - let Inst{24} = 1; // P bit - let Inst{27-25} = 0b000; -} -class AI3ldsbpr<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, - opc, asm, cstr, pattern> { - let Inst{4} = 1; - let Inst{5} = 0; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 1; // W bit - let Inst{24} = 1; // P bit - let Inst{27-25} = 0b000; -} -class AI3lddpr<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, - opc, asm, cstr, pattern> { - let Inst{4} = 1; - let Inst{5} = 0; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 0; // L bit - let Inst{21} = 1; // W bit - let Inst{24} = 1; // P bit - let Inst{27-25} = 0b000; -} - - // Pre-indexed stores class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> @@ -781,60 +613,6 @@ class AI3stdpr<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{27-25} = 0b000; } -// Post-indexed loads -class AI3ldhpo<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, - opc, asm, cstr,pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 0; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 0; // P bit - let Inst{27-25} = 0b000; -} -class AI3ldshpo<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, - opc, asm, cstr,pattern> { - let Inst{4} = 1; - let Inst{5} = 1; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 0; // P bit - let Inst{27-25} = 0b000; -} -class AI3ldsbpo<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, - opc, asm, cstr,pattern> { - let Inst{4} = 1; - let Inst{5} = 0; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 0; // P bit - let Inst{27-25} = 0b000; -} -class AI3lddpo<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, - opc, asm, cstr, pattern> { - let Inst{4} = 1; - let Inst{5} = 0; // H bit - let Inst{6} = 1; // S bit - let Inst{7} = 1; - let Inst{20} = 0; // L bit - let Inst{21} = 0; // W bit - let Inst{24} = 0; // P bit - let Inst{27-25} = 0b000; -} - // Post-indexed stores class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> @@ -864,21 +642,17 @@ class AI3stdpo<dag oops, dag iops, Format f, InstrItinClass itin, } // addrmode4 instructions -class AXI4ld<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin, - string asm, string cstr, list<dag> pattern> - : XI<oops, iops, AddrMode4, Size4Bytes, im, f, itin, - asm, cstr, pattern> { - let Inst{20} = 1; // L bit - let Inst{22} = 0; // S bit +class AXI4<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : XI<oops, iops, AddrMode4, Size4Bytes, im, f, itin, asm, cstr, pattern> { + bits<4> p; + bits<16> regs; + bits<4> Rn; + let Inst{31-28} = p; let Inst{27-25} = 0b100; -} -class AXI4st<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin, - string asm, string cstr, list<dag> pattern> - : XI<oops, iops, AddrMode4, Size4Bytes, im, f, itin, - asm, cstr, pattern> { - let Inst{20} = 0; // L bit let Inst{22} = 0; // S bit - let Inst{27-25} = 0b100; + let Inst{19-16} = Rn; + let Inst{15-0} = regs; } // Unsigned multiply, multiply-accumulate instructions. @@ -899,24 +673,65 @@ class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, } // Most significant word multiply -class AMul2I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> +class AMul2I<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin, opc, asm, "", pattern> { - let Inst{7-4} = 0b1001; + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{7-4} = opc7_4; let Inst{20} = 1; let Inst{27-21} = opcod; + let Inst{19-16} = Rd; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +// MSW multiple w/ Ra operand +class AMul2Ia<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AMul2I<opcod, opc7_4, oops, iops, itin, opc, asm, pattern> { + bits<4> Ra; + let Inst{15-12} = Ra; } // SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y> -class AMulxyI<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> +class AMulxyIbase<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin, opc, asm, "", pattern> { + bits<4> Rn; + bits<4> Rm; let Inst{4} = 0; let Inst{7} = 1; let Inst{20} = 0; let Inst{27-21} = opcod; + let Inst{6-5} = bit6_5; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +class AMulxyI<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + let Inst{19-16} = Rd; +} + +// AMulxyI with Ra operand +class AMulxyIa<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AMulxyI<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> { + bits<4> Ra; + let Inst{15-12} = Ra; +} +// SMLAL* +class AMulxyI64<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; } // Extend instructions. @@ -924,16 +739,47 @@ class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, itin, opc, asm, "", pattern> { + // All AExtI instructions have Rd and Rm register operands. + bits<4> Rd; + bits<4> Rm; + let Inst{15-12} = Rd; + let Inst{3-0} = Rm; let Inst{7-4} = 0b0111; + let Inst{9-8} = 0b00; let Inst{27-20} = opcod; } // Misc Arithmetic instructions. -class AMiscA1I<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> +class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, itin, opc, asm, "", pattern> { + bits<4> Rd; + bits<4> Rm; let Inst{27-20} = opcod; + let Inst{19-16} = 0b1111; + let Inst{15-12} = Rd; + let Inst{11-8} = 0b1111; + let Inst{7-4} = opc7_4; + let Inst{3-0} = Rm; +} + +// PKH instructions +class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, itin, + opc, asm, "", pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + bits<8> sh; + let Inst{27-20} = opcod; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-7} = sh{7-3}; + let Inst{6} = tb; + let Inst{5-4} = 0b01; + let Inst{3-0} = Rm; } //===----------------------------------------------------------------------===// @@ -950,12 +796,9 @@ class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> { } //===----------------------------------------------------------------------===// -// // Thumb Instruction Format Definitions. // -// TI - Thumb instruction. - class ThumbI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, InstrItinClass itin, string asm, string cstr, list<dag> pattern> : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { @@ -966,6 +809,7 @@ class ThumbI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, list<Predicate> Predicates = [IsThumb]; } +// TI - Thumb instruction. class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>; @@ -986,6 +830,13 @@ class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3, let Inst{12} = opcod3; } +// Move to/from coprocessor instructions +class T1Cop<dag oops, dag iops, string asm, list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size4Bytes, NoItinerary, asm, "", pattern>, + Encoding, Requires<[IsThumb, HasV6]> { + let Inst{31-28} = 0b1110; +} + // BR_JT instructions class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> @@ -999,7 +850,7 @@ class Thumb1I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, let InOperandList = iops; let AsmString = asm; let Pattern = pattern; - list<Predicate> Predicates = [IsThumb1Only]; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; } class T1I<dag oops, dag iops, InstrItinClass itin, @@ -1008,9 +859,6 @@ class T1I<dag oops, dag iops, InstrItinClass itin, class T1Ix2<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> : Thumb1I<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>; -class T1JTI<dag oops, dag iops, InstrItinClass itin, - string asm, list<dag> pattern> - : Thumb1I<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; // Two-address instructions class T1It<dag oops, dag iops, InstrItinClass itin, @@ -1025,9 +873,9 @@ class Thumb1sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { let OutOperandList = !con(oops, (outs s_cc_out:$s)); let InOperandList = !con(iops, (ins pred:$p)); - let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm)); + let AsmString = !strconcat(opc, "${s}${p}", asm); let Pattern = pattern; - list<Predicate> Predicates = [IsThumb1Only]; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; } class T1sI<dag oops, dag iops, InstrItinClass itin, @@ -1038,7 +886,7 @@ class T1sI<dag oops, dag iops, InstrItinClass itin, class T1sIt<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, - "$lhs = $dst", pattern>; + "$Rn = $Rdn", pattern>; // Thumb1 instruction that can be predicated. class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, @@ -1047,9 +895,9 @@ class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); - let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let AsmString = !strconcat(opc, "${p}", asm); let Pattern = pattern; - list<Predicate> Predicates = [IsThumb1Only]; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; } class T1pI<dag oops, dag iops, InstrItinClass itin, @@ -1060,17 +908,8 @@ class T1pI<dag oops, dag iops, InstrItinClass itin, class T1pIt<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, - "$lhs = $dst", pattern>; + "$Rn = $Rdn", pattern>; -class T1pI1<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : Thumb1pI<oops, iops, AddrModeT1_1, Size2Bytes, itin, opc, asm, "", pattern>; -class T1pI2<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : Thumb1pI<oops, iops, AddrModeT1_2, Size2Bytes, itin, opc, asm, "", pattern>; -class T1pI4<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : Thumb1pI<oops, iops, AddrModeT1_4, Size2Bytes, itin, opc, asm, "", pattern>; class T1pIs<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeT1_s, Size2Bytes, itin, opc, asm, "", pattern>; @@ -1099,7 +938,7 @@ class T1DataProcessing<bits<4> opcode> : Encoding16 { // A6.2.3 Special data instructions and branch and exchange encoding. class T1Special<bits<4> opcode> : Encoding16 { let Inst{15-10} = 0b010001; - let Inst{9-6} = opcode; + let Inst{9-6} = opcode; } // A6.2.4 Load/store single data item encoding. @@ -1107,12 +946,37 @@ class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 { let Inst{15-12} = opA; let Inst{11-9} = opB; } -class T1LdSt<bits<3> opB> : T1LoadStore<0b0101, opB>; -class T1LdSt4Imm<bits<3> opB> : T1LoadStore<0b0110, opB>; // Immediate, 4 bytes -class T1LdSt1Imm<bits<3> opB> : T1LoadStore<0b0111, opB>; // Immediate, 1 byte -class T1LdSt2Imm<bits<3> opB> : T1LoadStore<0b1000, opB>; // Immediate, 2 bytes class T1LdStSP<bits<3> opB> : T1LoadStore<0b1001, opB>; // SP relative +// Helper classes to encode Thumb1 loads and stores. For immediates, the +// following bits are used for "opA" (see A6.2.4): +// +// 0b0110 => Immediate, 4 bytes +// 0b1000 => Immediate, 2 bytes +// 0b0111 => Immediate, 1 byte +class T1pILdStEncode<bits<3> opcode, dag oops, dag iops, AddrMode am, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : Thumb1pI<oops, iops, am, Size2Bytes, itin, opc, asm, "", pattern>, + T1LoadStore<0b0101, opcode> { + bits<3> Rt; + bits<8> addr; + let Inst{8-6} = addr{5-3}; // Rm + let Inst{5-3} = addr{2-0}; // Rn + let Inst{2-0} = Rt; +} +class T1pILdStEncodeImm<bits<4> opA, bit opB, dag oops, dag iops, AddrMode am, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : Thumb1pI<oops, iops, am, Size2Bytes, itin, opc, asm, "", pattern>, + T1LoadStore<opA, {opB,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-6} = addr{7-3}; // imm5 + let Inst{5-3} = addr{2-0}; // Rn + let Inst{2-0} = Rt; +} + // A6.2.5 Miscellaneous 16-bit instructions encoding. class T1Misc<bits<7> opcode> : Encoding16 { let Inst{15-12} = 0b1011; @@ -1126,7 +990,7 @@ class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); - let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let AsmString = !strconcat(opc, "${p}", asm); let Pattern = pattern; list<Predicate> Predicates = [IsThumb2]; } @@ -1134,16 +998,19 @@ class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, // Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an // input operand since by default it's a zero register. It will become an // implicit def once it's "flipped". -// +// // FIXME: This uses unified syntax so {s} comes before {p}. We should make it // more consistent. class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + bits<1> s; // condition-code set flag ('1' if the insn should set the flags) + let Inst{20} = s; + let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p, cc_out:$s)); - let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm)); + let AsmString = !strconcat(opc, "${s}${p}", asm); let Pattern = pattern; list<Predicate> Predicates = [IsThumb2]; } @@ -1168,7 +1035,7 @@ class ThumbXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, let InOperandList = iops; let AsmString = asm; let Pattern = pattern; - list<Predicate> Predicates = [IsThumb1Only]; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; } class T2I<dag oops, dag iops, InstrItinClass itin, @@ -1186,17 +1053,23 @@ class T2Iso<dag oops, dag iops, InstrItinClass itin, class T2Ipc<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeT2_pc, Size4Bytes, itin, opc, asm, "", pattern>; -class T2Ii8s4<bit P, bit W, bit load, dag oops, dag iops, InstrItinClass itin, +class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeT2_i8s4, Size4Bytes, itin, opc, asm, "", pattern> { - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b00; + bits<4> Rt; + bits<4> Rt2; + bits<13> addr; + let Inst{31-25} = 0b1110100; let Inst{24} = P; - let Inst{23} = ?; // The U bit. + let Inst{23} = addr{8}; let Inst{22} = 1; let Inst{21} = W; - let Inst{20} = load; + let Inst{20} = isLoad; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = Rt{3-0}; + let Inst{11-8} = Rt2{3-0}; + let Inst{7-0} = addr{7-0}; } class T2sI<dag oops, dag iops, InstrItinClass itin, @@ -1210,9 +1083,11 @@ class T2JTI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; -class T2Ix2<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeNone, Size8Bytes, itin, opc, asm, "", pattern>; +// Move to/from coprocessor instructions +class T2Cop<dag oops, dag iops, string asm, list<dag> pattern> + : T2XI<oops, iops, NoItinerary, asm, pattern>, Requires<[IsThumb2, HasV6]> { + let Inst{31-28} = 0b1111; +} // Two-address instructions class T2XIt<dag oops, dag iops, InstrItinClass itin, @@ -1227,7 +1102,7 @@ class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre, : InstARM<am, Size4Bytes, im, ThumbFrm, GenericDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); - let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let AsmString = !strconcat(opc, "${p}", asm); let Pattern = pattern; list<Predicate> Predicates = [IsThumb2]; let Inst{31-27} = 0b11111; @@ -1240,29 +1115,25 @@ class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre, // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed let Inst{10} = pre; // The P bit. let Inst{8} = 1; // The W bit. -} -// Helper class for disassembly only -// A6.3.16 & A6.3.17 -// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions. -class T2I_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, dag iops, - InstrItinClass itin, string opc, string asm, list<dag> pattern> - : T2I<oops, iops, itin, opc, asm, pattern> { - let Inst{31-27} = 0b11111; - let Inst{26-24} = 0b011; - let Inst{23} = long; - let Inst{22-20} = op22_20; - let Inst{7-4} = op7_4; + bits<9> addr; + let Inst{7-0} = addr{7-0}; + let Inst{9} = addr{8}; // Sign bit + + bits<4> Rt; + bits<4> Rn; + let Inst{15-12} = Rt{3-0}; + let Inst{19-16} = Rn{3-0}; } // Tv5Pat - Same as Pat<>, but requires V5T Thumb mode. class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> { - list<Predicate> Predicates = [IsThumb1Only, HasV5T]; + list<Predicate> Predicates = [IsThumb, IsThumb1Only, HasV5T]; } // T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode. class T1Pat<dag pattern, dag result> : Pat<pattern, result> { - list<Predicate> Predicates = [IsThumb1Only]; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; } // T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode. @@ -1281,10 +1152,13 @@ class VFPI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, IndexMode im, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : InstARM<am, sz, im, f, VFPDomain, cstr, itin> { + bits<4> p; + let Inst{31-28} = p; let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); - let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let AsmString = !strconcat(opc, "${p}", asm); let Pattern = pattern; + let PostEncoderMethod = "VFPThumb2PostEncoder"; list<Predicate> Predicates = [HasVFP2]; } @@ -1293,17 +1167,22 @@ class VFPXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, IndexMode im, Format f, InstrItinClass itin, string asm, string cstr, list<dag> pattern> : InstARM<am, sz, im, f, VFPDomain, cstr, itin> { + bits<4> p; + let Inst{31-28} = p; let OutOperandList = oops; let InOperandList = iops; let AsmString = asm; let Pattern = pattern; + let PostEncoderMethod = "VFPThumb2PostEncoder"; list<Predicate> Predicates = [HasVFP2]; } class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern>; + opc, asm, "", pattern> { + let PostEncoderMethod = "VFPThumb2PostEncoder"; +} // ARM VFP addrmode5 loads and stores class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, @@ -1311,12 +1190,24 @@ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, string opc, string asm, list<dag> pattern> : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, VFPLdStFrm, itin, opc, asm, "", pattern> { + // Instruction operands. + bits<5> Dd; + bits<13> addr; + + // Encode instruction operands. + let Inst{23} = addr{8}; // U (add = (U == '1')) + let Inst{22} = Dd{4}; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Dd{3-0}; + let Inst{7-0} = addr{7-0}; // imm8 + // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; - let Inst{11-8} = 0b1011; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision - // 64-bit loads & stores operate on both NEON and VFP pipelines. + // Loads & stores operate on both NEON and VFP pipelines. let D = VFPNeonDomain; } @@ -1325,10 +1216,36 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, string opc, string asm, list<dag> pattern> : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, VFPLdStFrm, itin, opc, asm, "", pattern> { + // Instruction operands. + bits<5> Sd; + bits<13> addr; + + // Encode instruction operands. + let Inst{23} = addr{8}; // U (add = (U == '1')) + let Inst{22} = Sd{0}; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Sd{4-1}; + let Inst{7-0} = addr{7-0}; // imm8 + // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; - let Inst{11-8} = 0b1010; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + + // Loads & stores operate on both NEON and VFP pipelines. + let D = VFPNeonDomain; +} + +// VFP Load / store multiple pseudo instructions. +class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr, + list<dag> pattern> + : InstARM<AddrMode4, Size4Bytes, IndexModeNone, Pseudo, VFPNeonDomain, + cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let Pattern = pattern; + list<Predicate> Predicates = [HasVFP2]; } // Load / store multiple @@ -1336,21 +1253,40 @@ class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin, string asm, string cstr, list<dag> pattern> : VFPXI<oops, iops, AddrMode4, Size4Bytes, im, VFPLdStMulFrm, itin, asm, cstr, pattern> { + // Instruction operands. + bits<4> Rn; + bits<13> regs; + + // Encode instruction operands. + let Inst{19-16} = Rn; + let Inst{22} = regs{12}; + let Inst{15-12} = regs{11-8}; + let Inst{7-0} = regs{7-0}; + // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; - let Inst{11-8} = 0b1011; - - // 64-bit loads & stores operate on both NEON and VFP pipelines. - let D = VFPNeonDomain; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision } class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin, string asm, string cstr, list<dag> pattern> : VFPXI<oops, iops, AddrMode4, Size4Bytes, im, VFPLdStMulFrm, itin, asm, cstr, pattern> { + // Instruction operands. + bits<4> Rn; + bits<13> regs; + + // Encode instruction operands. + let Inst{19-16} = Rn; + let Inst{22} = regs{8}; + let Inst{15-12} = regs{12-9}; + let Inst{7-0} = regs{7-0}; + // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; - let Inst{11-8} = 0b1010; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision } // Double precision, unary @@ -1358,10 +1294,21 @@ class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Dd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; let Inst{19-16} = opcod3; - let Inst{11-8} = 0b1011; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision let Inst{7-6} = opcod4; let Inst{4} = opcod5; } @@ -1371,24 +1318,25 @@ class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { - let Inst{27-23} = opcod1; - let Inst{21-20} = opcod2; - let Inst{11-8} = 0b1011; - let Inst{6} = op6; - let Inst{4} = op4; -} + // Instruction operands. + bits<5> Dd; + bits<5> Dn; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{19-16} = Dn{3-0}; + let Inst{7} = Dn{4}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; -// Double precision, binary, VML[AS] (for additional predicate) -class ADbI_vmlX<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, - dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> - : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; - let Inst{11-8} = 0b1011; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision let Inst{6} = op6; let Inst{4} = op4; - list<Predicate> Predicates = [HasVFP2, UseVMLx]; } // Single precision, unary @@ -1396,16 +1344,27 @@ class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; let Inst{19-16} = opcod3; - let Inst{11-8} = 0b1010; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision let Inst{7-6} = opcod4; let Inst{4} = opcod5; } -// Single precision unary, if no NEON -// Same as ASuI except not available if NEON is enabled +// Single precision unary, if no NEON. Same as ASuI except not available if +// NEON is enabled. class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> @@ -1418,20 +1377,47 @@ class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sn; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; - let Inst{11-8} = 0b1010; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision let Inst{6} = op6; let Inst{4} = op4; } -// Single precision binary, if no NEON -// Same as ASbI except not available if NEON is enabled +// Single precision binary, if no NEON. Same as ASbI except not available if +// NEON is enabled. class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> { list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; + + // Instruction operands. + bits<5> Sd; + bits<5> Sn; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; } // VFP conversion instructions @@ -1502,9 +1488,7 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, : InstARM<am, Size4Bytes, im, f, NeonDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); - let AsmString = !strconcat( - !strconcat(!strconcat(opc, "${p}"), !strconcat(".", dt)), - !strconcat("\t", asm)); + let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm); let Pattern = pattern; list<Predicate> Predicates = [HasNEON]; } @@ -1516,7 +1500,7 @@ class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, : InstARM<am, Size4Bytes, im, f, NeonDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); - let AsmString = !strconcat(!strconcat(opc, "${p}"), !strconcat("\t", asm)); + let AsmString = !strconcat(opc, "${p}", "\t", asm); let Pattern = pattern; list<Predicate> Predicates = [HasNEON]; } @@ -1531,6 +1515,25 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, let Inst{21-20} = op21_20; let Inst{11-8} = op11_8; let Inst{7-4} = op7_4; + + let PostEncoderMethod = "NEONThumb2LoadStorePostEncoder"; + + bits<5> Vd; + bits<6> Rn; + bits<4> Rm; + + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; + let Inst{19-16} = Rn{3-0}; + let Inst{3-0} = Rm{3-0}; +} + +class NLdStLn<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NLdSt<op23, op21_20, op11_8, op7_4, oops, iops, itin, opc, + dt, asm, cstr, pattern> { + bits<3> lane; } class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr> @@ -1541,11 +1544,22 @@ class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr> list<Predicate> Predicates = [HasNEON]; } +class PseudoNeonI<dag oops, dag iops, InstrItinClass itin, string cstr, + list<dag> pattern> + : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, Pseudo, NeonDomain, cstr, + itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; +} + class NDataI<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr, pattern> { let Inst{31-25} = 0b1111001; + let PostEncoderMethod = "NEONThumb2DataIPostEncoder"; } class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin, @@ -1553,6 +1567,7 @@ class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin, : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm, cstr, pattern> { let Inst{31-25} = 0b1111001; + let PostEncoderMethod = "NEONThumb2DataIPostEncoder"; } // NEON "one register and a modified immediate" format. @@ -1569,6 +1584,16 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, let Inst{6} = op6; let Inst{5} = op5; let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<13> SIMM; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{24} = SIMM{7}; + let Inst{18-16} = SIMM{6-4}; + let Inst{3-0} = SIMM{3-0}; } // NEON 2 vector register format. @@ -1584,6 +1609,15 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, let Inst{11-7} = op11_7; let Inst{6} = op6; let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; } // Same as N2V except it doesn't have a datatype suffix. @@ -1599,6 +1633,15 @@ class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, let Inst{11-7} = op11_7; let Inst{6} = op6; let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; } // NEON 2 vector register with immediate. @@ -1612,6 +1655,17 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, let Inst{7} = op7; let Inst{6} = op6; let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vm; + bits<6> SIMM; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; + let Inst{21-16} = SIMM{5-0}; } // NEON 3 vector register format. @@ -1625,6 +1679,18 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{11-8} = op11_8; let Inst{6} = op6; let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; } // Same as N3V except it doesn't have a data type suffix. @@ -1639,13 +1705,25 @@ class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, let Inst{11-8} = op11_8; let Inst{6} = op6; let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; } // NEON VMOVs between scalar and core registers. class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, list<dag> pattern> - : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, f, GenericDomain, + : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, f, NeonDomain, "", itin> { let Inst{27-20} = opcod1; let Inst{11-8} = opcod2; @@ -1654,11 +1732,21 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); - let AsmString = !strconcat( - !strconcat(!strconcat(opc, "${p}"), !strconcat(".", dt)), - !strconcat("\t", asm)); + let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm); let Pattern = pattern; list<Predicate> Predicates = [HasNEON]; + + let PostEncoderMethod = "NEONThumb2DupPostEncoder"; + + bits<5> V; + bits<4> R; + bits<4> p; + bits<4> lane; + + let Inst{31-28} = p{3-0}; + let Inst{7} = V{4}; + let Inst{19-16} = V{3-0}; + let Inst{15-12} = R{3-0}; } class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, InstrItinClass itin, @@ -1687,6 +1775,15 @@ class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops, let Inst{11-7} = 0b11000; let Inst{6} = op6; let Inst{4} = 0; + + bits<5> Vd; + bits<5> Vm; + bits<4> lane; + + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; + let Inst{5} = Vm{4}; + let Inst{3-0} = Vm{3-0}; } // NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp index ba228ff..6f48d96 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -33,13 +33,13 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { default: break; case ARM::LDR_PRE: case ARM::LDR_POST: - return ARM::LDR; + return ARM::LDRi12; case ARM::LDRH_PRE: case ARM::LDRH_POST: return ARM::LDRH; case ARM::LDRB_PRE: case ARM::LDRB_POST: - return ARM::LDRB; + return ARM::LDRBi12; case ARM::LDRSH_PRE: case ARM::LDRSH_POST: return ARM::LDRSH; @@ -48,39 +48,14 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { return ARM::LDRSB; case ARM::STR_PRE: case ARM::STR_POST: - return ARM::STR; + return ARM::STRi12; case ARM::STRH_PRE: case ARM::STRH_POST: return ARM::STRH; case ARM::STRB_PRE: case ARM::STRB_POST: - return ARM::STRB; + return ARM::STRBi12; } return 0; } - -void ARMInstrInfo:: -reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo &TRI) const { - DebugLoc dl = Orig->getDebugLoc(); - unsigned Opcode = Orig->getOpcode(); - switch (Opcode) { - default: - break; - case ARM::MOVi2pieces: { - RI.emitLoadConstPool(MBB, I, dl, - DestReg, SubIdx, - Orig->getOperand(1).getImm(), - (ARMCC::CondCodes)Orig->getOperand(2).getImm(), - Orig->getOperand(3).getReg()); - MachineInstr *NewMI = prior(I); - NewMI->getOperand(0).setSubReg(SubIdx); - return; - } - } - - return ARMBaseInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, TRI); -} - diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h index 4563ffe..f2c7bdc 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -32,11 +32,6 @@ public: // if there is not such an opcode. unsigned getUnindexedOpcode(unsigned Opc) const; - void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned DestReg, unsigned SubIdx, - const MachineInstr *Orig, - const TargetRegisterInfo &TRI) const; - /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td index e66f9b9..c827ce3d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -58,10 +58,9 @@ def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; -def SDT_ARMMEMBARRIER : SDTypeProfile<0, 0, []>; -def SDT_ARMSYNCBARRIER : SDTypeProfile<0, 0, []>; -def SDT_ARMMEMBARRIERMCR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def SDT_ARMSYNCBARRIERMCR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; @@ -70,33 +69,35 @@ def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, // Node definitions. def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; +def ARMWrapperDYN : SDNode<"ARMISD::WrapperDYN", SDTIntUnaryOp>; +def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>; def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart, - [SDNPHasChain, SDNPOutFlag]>; + [SDNPHasChain, SDNPOutGlue]>; def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd, - [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall, - [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; def ARMcall_pred : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall, - [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall, - [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInFlag]>; + [SDNPHasChain, SDNPOptInGlue]>; def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, - [SDNPInFlag]>; + [SDNPInGlue]>; def ARMcneg : SDNode<"ARMISD::CNEG", SDT_ARMCMov, - [SDNPInFlag]>; + [SDNPInGlue]>; def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, - [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>; + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT, [SDNPHasChain]>; @@ -106,40 +107,38 @@ def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT, def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64, [SDNPHasChain]>; -def ARMand : SDNode<"ARMISD::AND", SDT_ARMAnd, - [SDNPOutFlag]>; - def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp, - [SDNPOutFlag]>; + [SDNPOutGlue]>; def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp, - [SDNPOutFlag, SDNPCommutative]>; + [SDNPOutGlue, SDNPCommutative]>; def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>; -def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>; -def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>; -def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInFlag ]>; +def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>; +def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>; +def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>; def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>; def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>; def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", - SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>; + SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>; +def ARMeh_sjlj_dispatchsetup: SDNode<"ARMISD::EH_SJLJ_DISPATCHSETUP", + SDT_ARMEH_SJLJ_DispatchSetup, [SDNPHasChain]>; + def ARMMemBarrier : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER, [SDNPHasChain]>; -def ARMSyncBarrier : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIER, - [SDNPHasChain]>; -def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERMCR, - [SDNPHasChain]>; -def ARMSyncBarrierMCR : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERMCR, +def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER, [SDNPHasChain]>; +def ARMPreload : SDNode<"ARMISD::PRELOAD", SDTPrefetch, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; -def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, - [SDNPHasChain, SDNPOptInFlag, SDNPVariadic]>; +def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; @@ -147,34 +146,40 @@ def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. // -def HasV4T : Predicate<"Subtarget->hasV4TOps()">; +def HasV4T : Predicate<"Subtarget->hasV4TOps()">, AssemblerPredicate; def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; def HasV5T : Predicate<"Subtarget->hasV5TOps()">; -def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">; -def HasV6 : Predicate<"Subtarget->hasV6Ops()">; -def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">; +def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">, AssemblerPredicate; +def HasV6 : Predicate<"Subtarget->hasV6Ops()">, AssemblerPredicate; +def NoV6 : Predicate<"!Subtarget->hasV6Ops()">; +def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">, AssemblerPredicate; def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; -def HasV7 : Predicate<"Subtarget->hasV7Ops()">; +def HasV7 : Predicate<"Subtarget->hasV7Ops()">, AssemblerPredicate; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; -def HasVFP2 : Predicate<"Subtarget->hasVFP2()">; -def HasVFP3 : Predicate<"Subtarget->hasVFP3()">; -def HasNEON : Predicate<"Subtarget->hasNEON()">; -def HasDivide : Predicate<"Subtarget->hasDivide()">; -def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">; -def HasDB : Predicate<"Subtarget->hasDataBarrier()">; +def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate; +def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, AssemblerPredicate; +def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate; +def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate; +def HasDivide : Predicate<"Subtarget->hasDivide()">, AssemblerPredicate; +def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">, + AssemblerPredicate; +def HasDB : Predicate<"Subtarget->hasDataBarrier()">, + AssemblerPredicate; +def HasMP : Predicate<"Subtarget->hasMPExtension()">, + AssemblerPredicate; def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; -def IsThumb : Predicate<"Subtarget->isThumb()">; +def IsThumb : Predicate<"Subtarget->isThumb()">, AssemblerPredicate; def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; -def IsThumb2 : Predicate<"Subtarget->isThumb2()">; -def IsARM : Predicate<"!Subtarget->isThumb()">; +def IsThumb2 : Predicate<"Subtarget->isThumb2()">, AssemblerPredicate; +def IsARM : Predicate<"!Subtarget->isThumb()">, AssemblerPredicate; def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; // FIXME: Eventually this will be just "hasV6T2Ops". def UseMovt : Predicate<"Subtarget->useMovt()">; def DontUseMovt : Predicate<"!Subtarget->useMovt()">; -def UseVMLx : Predicate<"Subtarget->useVMLx()">; +def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; //===----------------------------------------------------------------------===// // ARM Flag Definitions. @@ -199,12 +204,6 @@ def so_imm_not_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32); }]>; -// rot_imm predicate - True if the 32-bit immediate is equal to 8, 16, or 24. -def rot_imm : PatLeaf<(i32 imm), [{ - int32_t v = (int32_t)N->getZExtValue(); - return v == 8 || v == 16 || v == 24; -}]>; - /// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15]. def imm1_15 : PatLeaf<(i32 imm), [{ return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 16; @@ -217,12 +216,12 @@ def imm16_31 : PatLeaf<(i32 imm), [{ def so_imm_neg : PatLeaf<(imm), [{ - return ARM_AM::getSOImmVal(-(int)N->getZExtValue()) != -1; + return ARM_AM::getSOImmVal(-(uint32_t)N->getZExtValue()) != -1; }], so_imm_neg_XFORM>; def so_imm_not : PatLeaf<(imm), [{ - return ARM_AM::getSOImmVal(~(int)N->getZExtValue()) != -1; + return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1; }], so_imm_not_XFORM>; // sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits. @@ -230,15 +229,6 @@ def sext_16_node : PatLeaf<(i32 GPR:$a), [{ return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17; }]>; -/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield -/// e.g., 0xf000ffff -def bf_inv_mask_imm : Operand<i32>, - PatLeaf<(imm), [{ - return ARM::isBitFieldInvertedMask(N->getZExtValue()); -}] > { - let PrintMethod = "printBitfieldInvMaskImmOperand"; -} - /// Split a 32-bit immediate into two 16 bit parts. def hi16 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32); @@ -273,28 +263,103 @@ def sube_live_carry : PatFrag<(ops node:$LHS, node:$RHS), (sube node:$LHS, node:$RHS), [{return N->hasAnyUseOfValue(1);}]>; +// An 'and' node with a single use. +def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; + +// An 'xor' node with a single use. +def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; + +// An 'fmul' node with a single use. +def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{ + return N->hasOneUse(); +}]>; + +// An 'fadd' node which checks for single non-hazardous use. +def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{ + return hasNoVMLxHazardUse(N); +}]>; + +// An 'fsub' node which checks for single non-hazardous use. +def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ + return hasNoVMLxHazardUse(N); +}]>; + //===----------------------------------------------------------------------===// // Operand Definitions. // // Branch target. -def brtarget : Operand<OtherVT>; +// FIXME: rename brtarget to t2_brtarget +def brtarget : Operand<OtherVT> { + let EncoderMethod = "getBranchTargetOpValue"; +} + +// FIXME: get rid of this one? +def uncondbrtarget : Operand<OtherVT> { + let EncoderMethod = "getUnconditionalBranchTargetOpValue"; +} + +// Branch target for ARM. Handles conditional/unconditional +def br_target : Operand<OtherVT> { + let EncoderMethod = "getARMBranchTargetOpValue"; +} + +// Call target. +// FIXME: rename bltarget to t2_bl_target? +def bltarget : Operand<i32> { + // Encoded the same as branch targets. + let EncoderMethod = "getBranchTargetOpValue"; +} + +// Call target for ARM. Handles conditional/unconditional +// FIXME: rename bl_target to t2_bltarget? +def bl_target : Operand<i32> { + // Encoded the same as branch targets. + let EncoderMethod = "getARMBranchTargetOpValue"; +} + // A list of registers separated by comma. Used by load/store multiple. +def RegListAsmOperand : AsmOperandClass { + let Name = "RegList"; + let SuperClasses = []; +} + +def DPRRegListAsmOperand : AsmOperandClass { + let Name = "DPRRegList"; + let SuperClasses = []; +} + +def SPRRegListAsmOperand : AsmOperandClass { + let Name = "SPRRegList"; + let SuperClasses = []; +} + def reglist : Operand<i32> { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = RegListAsmOperand; let PrintMethod = "printRegisterList"; } -// An operand for the CONSTPOOL_ENTRY pseudo-instruction. -def cpinst_operand : Operand<i32> { - let PrintMethod = "printCPInstOperand"; +def dpr_reglist : Operand<i32> { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = DPRRegListAsmOperand; + let PrintMethod = "printRegisterList"; } -def jtblock_operand : Operand<i32> { - let PrintMethod = "printJTBlockOperand"; +def spr_reglist : Operand<i32> { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = SPRRegListAsmOperand; + let PrintMethod = "printRegisterList"; } -def jt2block_operand : Operand<i32> { - let PrintMethod = "printJT2BlockOperand"; + +// An operand for the CONSTPOOL_ENTRY pseudo-instruction. +def cpinst_operand : Operand<i32> { + let PrintMethod = "printCPInstOperand"; } // Local PC labels. @@ -302,6 +367,22 @@ def pclabel : Operand<i32> { let PrintMethod = "printPCLabel"; } +// ADR instruction labels. +def adrlabel : Operand<i32> { + let EncoderMethod = "getAdrLabelOpValue"; +} + +def neon_vcvt_imm32 : Operand<i32> { + let EncoderMethod = "getNEONVcvtImm32OpValue"; +} + +// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24. +def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{ + int32_t v = (int32_t)N->getZExtValue(); + return v == 8 || v == 16 || v == 24; }]> { + let EncoderMethod = "getRotImmOpValue"; +} + // shift_imm: An integer that encodes a shift amount and the type of shift // (currently either asr or lsl) using the same encoding used for the // immediates in so_reg operands. @@ -313,73 +394,120 @@ def shift_imm : Operand<i32> { def so_reg : Operand<i32>, // reg reg imm ComplexPattern<i32, 3, "SelectShifterOperandReg", [shl,srl,sra,rotr]> { + let EncoderMethod = "getSORegOpValue"; + let PrintMethod = "printSORegOperand"; + let MIOperandInfo = (ops GPR, GPR, i32imm); +} +def shift_so_reg : Operand<i32>, // reg reg imm + ComplexPattern<i32, 3, "SelectShiftShifterOperandReg", + [shl,srl,sra,rotr]> { + let EncoderMethod = "getSORegOpValue"; let PrintMethod = "printSORegOperand"; let MIOperandInfo = (ops GPR, GPR, i32imm); } // so_imm - Match a 32-bit shifter_operand immediate operand, which is an -// 8-bit immediate rotated by an arbitrary number of bits. so_imm values are -// represented in the imm field in the same 12-bit form that they are encoded -// into so_imm instructions: the 8-bit immediate is the least significant bits -// [bits 0-7], the 4-bit shift amount is the next 4 bits [bits 8-11]. +// 8-bit immediate rotated by an arbitrary number of bits. def so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_so_imm(N); }]> { + let EncoderMethod = "getSOImmOpValue"; let PrintMethod = "printSOImmOperand"; } // Break so_imm's up into two pieces. This handles immediates with up to 16 // bits set in them. This uses so_imm2part to match and so_imm2part_[12] to // get the first/second pieces. -def so_imm2part : Operand<i32>, - PatLeaf<(imm), [{ +def so_imm2part : PatLeaf<(imm), [{ return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue()); - }]> { - let PrintMethod = "printSOImm2PartOperand"; -} +}]>; -def so_imm2part_1 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getSOImmTwoPartFirst((unsigned)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); +/// arm_i32imm - True for +V6T2, or true only if so_imm2part is true. +/// +def arm_i32imm : PatLeaf<(imm), [{ + if (Subtarget->hasV6T2Ops()) + return true; + return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue()); }]>; -def so_imm2part_2 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getSOImmTwoPartSecond((unsigned)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); +/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31]. +def imm0_31 : Operand<i32>, PatLeaf<(imm), [{ + return (int32_t)N->getZExtValue() < 32; }]>; -def so_neg_imm2part : Operand<i32>, PatLeaf<(imm), [{ - return ARM_AM::isSOImmTwoPartVal(-(int)N->getZExtValue()); - }]> { - let PrintMethod = "printSOImm2PartOperand"; +/// imm0_31_m1 - Matches and prints like imm0_31, but encodes as 'value - 1'. +def imm0_31_m1 : Operand<i32>, PatLeaf<(imm), [{ + return (int32_t)N->getZExtValue() < 32; +}]> { + let EncoderMethod = "getImmMinusOneOpValue"; } -def so_neg_imm2part_1 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getSOImmTwoPartFirst(-(int)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); -}]>; +// i32imm_hilo16 - For movt/movw - sets the MC Encoder method. +// The imm is split into imm{15-12}, imm{11-0} +// +def i32imm_hilo16 : Operand<i32> { + let EncoderMethod = "getHiLo16ImmOpValue"; +} -def so_neg_imm2part_2 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getSOImmTwoPartSecond(-(int)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); -}]>; +/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield +/// e.g., 0xf000ffff +def bf_inv_mask_imm : Operand<i32>, + PatLeaf<(imm), [{ + return ARM::isBitFieldInvertedMask(N->getZExtValue()); +}] > { + let EncoderMethod = "getBitfieldInvertedMaskOpValue"; + let PrintMethod = "printBitfieldInvMaskImmOperand"; +} -/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31]. -def imm0_31 : Operand<i32>, PatLeaf<(imm), [{ - return (int32_t)N->getZExtValue() < 32; +/// lsb_pos_imm - position of the lsb bit, used by BFI4p and t2BFI4p +def lsb_pos_imm : Operand<i32>, PatLeaf<(imm), [{ + return isInt<5>(N->getSExtValue()); }]>; +/// width_imm - number of bits to be copied, used by BFI4p and t2BFI4p +def width_imm : Operand<i32>, PatLeaf<(imm), [{ + return N->getSExtValue() > 0 && N->getSExtValue() <= 32; +}] > { + let EncoderMethod = "getMsbOpValue"; +} + // Define ARM specific addressing modes. -// addrmode2 := reg +/- reg shop imm + +// addrmode_imm12 := reg +/- imm12 +// +def addrmode_imm12 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrModeImm12", []> { + // 12-bit immediate operand. Note that instructions using this encode + // #0 and #-0 differently. We flag #-0 as the magic value INT32_MIN. All other + // immediate values are as normal. + + let EncoderMethod = "getAddrModeImm12OpValue"; + let PrintMethod = "printAddrModeImm12Operand"; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} +// ldst_so_reg := reg +/- reg shop imm +// +def ldst_so_reg : Operand<i32>, + ComplexPattern<i32, 3, "SelectLdStSOReg", []> { + let EncoderMethod = "getLdStSORegOpValue"; + // FIXME: Simplify the printer + let PrintMethod = "printAddrMode2Operand"; + let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); +} + // addrmode2 := reg +/- imm12 +// := reg +/- reg shop imm // def addrmode2 : Operand<i32>, ComplexPattern<i32, 3, "SelectAddrMode2", []> { + let EncoderMethod = "getAddrMode2OpValue"; let PrintMethod = "printAddrMode2Operand"; let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); } def am2offset : Operand<i32>, - ComplexPattern<i32, 2, "SelectAddrMode2Offset", []> { + ComplexPattern<i32, 2, "SelectAddrMode2Offset", + [], [SDNPWantRoot]> { + let EncoderMethod = "getAddrMode2OffsetOpValue"; let PrintMethod = "printAddrMode2OffsetOperand"; let MIOperandInfo = (ops GPR, i32imm); } @@ -389,22 +517,29 @@ def am2offset : Operand<i32>, // def addrmode3 : Operand<i32>, ComplexPattern<i32, 3, "SelectAddrMode3", []> { + let EncoderMethod = "getAddrMode3OpValue"; let PrintMethod = "printAddrMode3Operand"; let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); } def am3offset : Operand<i32>, - ComplexPattern<i32, 2, "SelectAddrMode3Offset", []> { + ComplexPattern<i32, 2, "SelectAddrMode3Offset", + [], [SDNPWantRoot]> { + let EncoderMethod = "getAddrMode3OffsetOpValue"; let PrintMethod = "printAddrMode3OffsetOperand"; let MIOperandInfo = (ops GPR, i32imm); } -// addrmode4 := reg, <mode|W> +// ldstm_mode := {ia, ib, da, db} // -def addrmode4 : Operand<i32>, - ComplexPattern<i32, 2, "SelectAddrMode4", []> { - let PrintMethod = "printAddrMode4Operand"; - let MIOperandInfo = (ops GPR:$addr, i32imm); +def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> { + let EncoderMethod = "getLdStmModeOpValue"; + let PrintMethod = "printLdStmModeOperand"; +} + +def MemMode5AsmOperand : AsmOperandClass { + let Name = "MemMode5"; + let SuperClasses = []; } // addrmode5 := reg +/- imm8*4 @@ -413,19 +548,32 @@ def addrmode5 : Operand<i32>, ComplexPattern<i32, 2, "SelectAddrMode5", []> { let PrintMethod = "printAddrMode5Operand"; let MIOperandInfo = (ops GPR:$base, i32imm); + let ParserMatchClass = MemMode5AsmOperand; + let EncoderMethod = "getAddrMode5OpValue"; } -// addrmode6 := reg with optional writeback +// addrmode6 := reg with optional alignment // def addrmode6 : Operand<i32>, - ComplexPattern<i32, 2, "SelectAddrMode6", []> { + ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{ let PrintMethod = "printAddrMode6Operand"; let MIOperandInfo = (ops GPR:$addr, i32imm); + let EncoderMethod = "getAddrMode6AddressOpValue"; } def am6offset : Operand<i32> { let PrintMethod = "printAddrMode6OffsetOperand"; let MIOperandInfo = (ops GPR); + let EncoderMethod = "getAddrMode6OffsetOpValue"; +} + +// Special version of addrmode6 to handle alignment encoding for VLD-dup +// instructions, specifically VLD4-dup. +def addrmode6dup : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{ + let PrintMethod = "printAddrMode6Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm); + let EncoderMethod = "getAddrMode6DupAddressOpValue"; } // addrmodepc := pc + reg @@ -440,6 +588,28 @@ def nohash_imm : Operand<i32> { let PrintMethod = "printNoHashImmediate"; } +def CoprocNumAsmOperand : AsmOperandClass { + let Name = "CoprocNum"; + let SuperClasses = []; + let ParserMethod = "tryParseCoprocNumOperand"; +} + +def CoprocRegAsmOperand : AsmOperandClass { + let Name = "CoprocReg"; + let SuperClasses = []; + let ParserMethod = "tryParseCoprocRegOperand"; +} + +def p_imm : Operand<i32> { + let PrintMethod = "printPImmediate"; + let ParserMatchClass = CoprocNumAsmOperand; +} + +def c_imm : Operand<i32> { + let PrintMethod = "printCImmediate"; + let ParserMatchClass = CoprocRegAsmOperand; +} + //===----------------------------------------------------------------------===// include "ARMInstrFormats.td" @@ -450,55 +620,93 @@ include "ARMInstrFormats.td" /// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a /// binop that produces a value. -multiclass AsI1_bin_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0> { +multiclass AsI1_bin_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> { // The register-immediate version is re-materializable. This is useful // in particular for taking the address of a local. let isReMaterializable = 1 in { - def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, - IIC_iALUi, opc, "\t$dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> { + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, + iii, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; let Inst{25} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-0} = imm; } } - def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, - IIC_iALUr, opc, "\t$dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> { - let Inst{11-4} = 0b00000000; + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, + iir, opc, "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; let Inst{25} = 0; let isCommutable = Commutable; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rm; } - def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, - IIC_iALUsr, opc, "\t$dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> { + def rs : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-0} = shift; } } /// AI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the /// instruction modifies the CPSR register. -let Defs = [CPSR] in { -multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0> { - def ri : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, - IIC_iALUi, opc, "\t$dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> { - let Inst{20} = 1; +let isCodeGenOnly = 1, Defs = [CPSR] in { +multiclass AI1_bin_s_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> { + def ri : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, + iii, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; let Inst{25} = 1; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-0} = imm; } - def rr : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, - IIC_iALUr, opc, "\t$dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> { + def rr : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, + iir, opc, "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; let isCommutable = Commutable; - let Inst{11-4} = 0b00000000; - let Inst{20} = 1; let Inst{25} = 0; - } - def rs : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, - IIC_iALUsr, opc, "\t$dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> { let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rm; + } + def rs : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-0} = shift; } } } @@ -507,146 +715,233 @@ multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, /// patterns. Similar to AsI1_bin_irs except the instruction does not produce /// a explicit result, only implicitly set CPSR. let isCompare = 1, Defs = [CPSR] in { -multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0> { - def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iCMPi, - opc, "\t$a, $b", - [(opnode GPR:$a, so_imm:$b)]> { - let Inst{20} = 1; +multiclass AI1_cmp_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> { + def ri : AI1<opcod, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, iii, + opc, "\t$Rn, $imm", + [(opnode GPR:$Rn, so_imm:$imm)]> { + bits<4> Rn; + bits<12> imm; let Inst{25} = 1; - } - def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm, IIC_iCMPr, - opc, "\t$a, $b", - [(opnode GPR:$a, GPR:$b)]> { - let Inst{11-4} = 0b00000000; let Inst{20} = 1; - let Inst{25} = 0; - let isCommutable = Commutable; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-0} = imm; } - def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iCMPsr, - opc, "\t$a, $b", - [(opnode GPR:$a, so_reg:$b)]> { + def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir, + opc, "\t$Rn, $Rm", + [(opnode GPR:$Rn, GPR:$Rm)]> { + bits<4> Rn; + bits<4> Rm; + let isCommutable = Commutable; + let Inst{25} = 0; let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rm; + } + def rs : AI1<opcod, (outs), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm, iis, + opc, "\t$Rn, $shift", + [(opnode GPR:$Rn, so_reg:$shift)]> { + bits<4> Rn; + bits<12> shift; let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-0} = shift; } } } -/// AI_unary_rrot - A unary operation with two forms: one whose operand is a +/// AI_ext_rrot - A unary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. /// FIXME: Remove the 'r' variant. Its rot_imm is zero. -multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> { - def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src), - IIC_iUNAr, opc, "\t$dst, $src", - [(set GPR:$dst, (opnode GPR:$src))]>, +multiclass AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode> { + def r : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iEXTr, opc, "\t$Rd, $Rm", + [(set GPR:$Rd, (opnode GPR:$Rm))]>, Requires<[IsARM, HasV6]> { - let Inst{11-10} = 0b00; + bits<4> Rd; + bits<4> Rm; let Inst{19-16} = 0b1111; + let Inst{15-12} = Rd; + let Inst{11-10} = 0b00; + let Inst{3-0} = Rm; } - def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot), - IIC_iUNAsi, opc, "\t$dst, $src, ror $rot", - [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>, + def r_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm, rot_imm:$rot), + IIC_iEXTr, opc, "\t$Rd, $Rm, ror $rot", + [(set GPR:$Rd, (opnode (rotr GPR:$Rm, rot_imm:$rot)))]>, Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rm; + bits<2> rot; let Inst{19-16} = 0b1111; + let Inst{15-12} = Rd; + let Inst{11-10} = rot; + let Inst{3-0} = Rm; } } -multiclass AI_unary_rrot_np<bits<8> opcod, string opc> { - def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src), - IIC_iUNAr, opc, "\t$dst, $src", +multiclass AI_ext_rrot_np<bits<8> opcod, string opc> { + def r : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iEXTr, opc, "\t$Rd, $Rm", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6]> { - let Inst{11-10} = 0b00; let Inst{19-16} = 0b1111; + let Inst{11-10} = 0b00; } - def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot), - IIC_iUNAsi, opc, "\t$dst, $src, ror $rot", + def r_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm, rot_imm:$rot), + IIC_iEXTr, opc, "\t$Rd, $Rm, ror $rot", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6]> { + bits<2> rot; let Inst{19-16} = 0b1111; + let Inst{11-10} = rot; } } -/// AI_bin_rrot - A binary operation with two forms: one whose operand is a +/// AI_exta_rrot - A binary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. -multiclass AI_bin_rrot<bits<8> opcod, string opc, PatFrag opnode> { - def rr : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), - IIC_iALUr, opc, "\t$dst, $LHS, $RHS", - [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>, +multiclass AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode> { + def rr : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; let Inst{11-10} = 0b00; + let Inst{9-4} = 0b000111; + let Inst{3-0} = Rm; + } + def rr_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, + rot_imm:$rot), + IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm, ror $rot", + [(set GPR:$Rd, (opnode GPR:$Rn, + (rotr GPR:$Rm, rot_imm:$rot)))]>, + Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + bits<2> rot; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-10} = rot; + let Inst{9-4} = 0b000111; + let Inst{3-0} = Rm; } - def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, - i32imm:$rot), - IIC_iALUsi, opc, "\t$dst, $LHS, $RHS, ror $rot", - [(set GPR:$dst, (opnode GPR:$LHS, - (rotr GPR:$RHS, rot_imm:$rot)))]>, - Requires<[IsARM, HasV6]>; } // For disassembly only. -multiclass AI_bin_rrot_np<bits<8> opcod, string opc> { - def rr : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), - IIC_iALUr, opc, "\t$dst, $LHS, $RHS", +multiclass AI_exta_rrot_np<bits<8> opcod, string opc> { + def rr : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6]> { let Inst{11-10} = 0b00; } - def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, - i32imm:$rot), - IIC_iALUsi, opc, "\t$dst, $LHS, $RHS, ror $rot", + def rr_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, + rot_imm:$rot), + IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm, ror $rot", [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM, HasV6]>; + Requires<[IsARM, HasV6]> { + bits<4> Rn; + bits<2> rot; + let Inst{19-16} = Rn; + let Inst{11-10} = rot; + } } /// AI1_adde_sube_irs - Define instructions and patterns for adde and sube. let Uses = [CPSR] in { multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { - def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), - DPFrm, IIC_iALUi, opc, "\t$dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>, Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; let Inst{25} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{11-0} = imm; } - def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - DPFrm, IIC_iALUr, opc, "\t$dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM]> { - let isCommutable = Commutable; + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; let Inst{11-4} = 0b00000000; let Inst{25} = 0; + let isCommutable = Commutable; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; } - def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), - DPSoRegFrm, IIC_iALUsr, opc, "\t$dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, + def rs : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + DPSoRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>, Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; let Inst{25} = 0; + let Inst{11-0} = shift; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; } } // Carry setting variants -let Defs = [CPSR] in { +let isCodeGenOnly = 1, Defs = [CPSR] in { multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { - def Sri : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), - DPFrm, IIC_iALUi, !strconcat(opc, "\t$dst, $a, $b"), - [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, + def Sri : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + DPFrm, IIC_iALUi, !strconcat(opc, "\t$Rd, $Rn, $imm"), + [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>, Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{11-0} = imm; let Inst{20} = 1; let Inst{25} = 1; } - def Srr : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - DPFrm, IIC_iALUr, !strconcat(opc, "\t$dst, $a, $b"), - [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, + def Srr : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + DPFrm, IIC_iALUr, !strconcat(opc, "\t$Rd, $Rn, $Rm"), + [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; let Inst{11-4} = 0b00000000; + let isCommutable = Commutable; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; let Inst{20} = 1; let Inst{25} = 0; } - def Srs : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), - DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "\t$dst, $a, $b"), - [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, + def Srs : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "\t$Rd, $Rn, $shift"), + [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>, Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{11-0} = shift; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; let Inst{20} = 1; let Inst{25} = 0; } @@ -654,6 +949,62 @@ multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, } } +let canFoldAsLoad = 1, isReMaterializable = 1 in { +multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii, + InstrItinClass iir, PatFrag opnode> { + // Note: We use the complex addrmode_imm12 rather than just an input + // GPR and a constrained immediate so that we can use this to match + // frame index references and avoid matching constant pool references. + def i12: AI2ldst<0b010, 1, isByte, (outs GPR:$Rt), (ins addrmode_imm12:$addr), + AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr", + [(set GPR:$Rt, (opnode addrmode_imm12:$addr))]> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 + } + def rs : AI2ldst<0b011, 1, isByte, (outs GPR:$Rt), (ins ldst_so_reg:$shift), + AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", + [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> { + bits<4> Rt; + bits<17> shift; + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = shift{11-0}; + } +} +} + +multiclass AI_str1<bit isByte, string opc, InstrItinClass iii, + InstrItinClass iir, PatFrag opnode> { + // Note: We use the complex addrmode_imm12 rather than just an input + // GPR and a constrained immediate so that we can use this to match + // frame index references and avoid matching constant pool references. + def i12 : AI2ldst<0b010, 0, isByte, (outs), + (ins GPR:$Rt, addrmode_imm12:$addr), + AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr", + [(opnode GPR:$Rt, addrmode_imm12:$addr)]> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 + } + def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPR:$Rt, ldst_so_reg:$shift), + AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", + [(opnode GPR:$Rt, ldst_so_reg:$shift)]> { + bits<4> Rt; + bits<17> shift; + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = shift{11-0}; + } +} //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -669,8 +1020,7 @@ multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, let neverHasSideEffects = 1, isNotDuplicable = 1 in def CONSTPOOL_ENTRY : PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, - i32imm:$size), NoItinerary, - "${instid:label} ${cpidx:cpentry}", []>; + i32imm:$size), NoItinerary, []>; // FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE // from removing one half of the matched pairs. That breaks PEI, which assumes @@ -678,12 +1028,10 @@ PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { def ADJCALLSTACKUP : PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary, - "${:comment} ADJCALLSTACKUP $amt1", [(ARMcallseq_end timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKDOWN : PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, - "${:comment} ADJCALLSTACKDOWN $amt", [(ARMcallseq_start timm:$amt)]>; } @@ -691,6 +1039,7 @@ def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6T2]> { let Inst{27-16} = 0b001100100000; + let Inst{15-8} = 0b11110000; let Inst{7-0} = 0b00000000; } @@ -698,6 +1047,7 @@ def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6T2]> { let Inst{27-16} = 0b001100100000; + let Inst{15-8} = 0b11110000; let Inst{7-0} = 0b00000001; } @@ -705,6 +1055,7 @@ def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6T2]> { let Inst{27-16} = 0b001100100000; + let Inst{15-8} = 0b11110000; let Inst{7-0} = 0b00000010; } @@ -712,6 +1063,7 @@ def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6T2]> { let Inst{27-16} = 0b001100100000; + let Inst{15-8} = 0b11110000; let Inst{7-0} = 0b00000011; } @@ -719,14 +1071,22 @@ def SEL : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, NoItinerary, "sel", "\t$dst, $a, $b", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; let Inst{27-20} = 0b01101000; let Inst{7-4} = 0b1011; + let Inst{11-8} = 0b1111; } def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6T2]> { let Inst{27-16} = 0b001100100000; + let Inst{15-8} = 0b11110000; let Inst{7-0} = 0b00000100; } @@ -735,154 +1095,174 @@ def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "", def BKPT : AI<(outs), (ins i32imm:$val), MiscFrm, NoItinerary, "bkpt", "\t$val", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM]> { + bits<16> val; + let Inst{3-0} = val{3-0}; + let Inst{19-8} = val{15-4}; let Inst{27-20} = 0b00010010; let Inst{7-4} = 0b0111; } -// Change Processor State is a system instruction -- for disassembly only. -// The singleton $opt operand contains the following information: -// opt{4-0} = mode from Inst{4-0} -// opt{5} = changemode from Inst{17} -// opt{8-6} = AIF from Inst{8-6} -// opt{10-9} = imod from Inst{19-18} with 0b10 as enable and 0b11 as disable -def CPS : AXI<(outs), (ins cps_opt:$opt), MiscFrm, NoItinerary, "cps$opt", - [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM]> { +// Change Processor State is a system instruction -- for disassembly and +// parsing only. +// FIXME: Since the asm parser has currently no clean way to handle optional +// operands, create 3 versions of the same instruction. Once there's a clean +// framework to represent optional operands, change this behavior. +class CPS<dag iops, string asm_ops> + : AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops), + [/* For disassembly only; pattern left blank */]>, Requires<[IsARM]> { + bits<2> imod; + bits<3> iflags; + bits<5> mode; + bit M; + let Inst{31-28} = 0b1111; let Inst{27-20} = 0b00010000; - let Inst{16} = 0; - let Inst{5} = 0; + let Inst{19-18} = imod; + let Inst{17} = M; // Enabled if mode is set; + let Inst{16} = 0; + let Inst{8-6} = iflags; + let Inst{5} = 0; + let Inst{4-0} = mode; } +let M = 1 in + def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode), + "$imod\t$iflags, $mode">; +let mode = 0, M = 0 in + def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">; + +let imod = 0, iflags = 0, M = 1 in + def CPS1p : CPS<(ins i32imm:$mode), "\t$mode">; + // Preload signals the memory system of possible future data/instruction access. // These are for disassembly only. -// -// A8.6.117, A8.6.118. Different instructions are generated for #0 and #-0. -// The neg_zero operand translates -0 to -1, -1 to -2, ..., etc. -multiclass APreLoad<bit data, bit read, string opc> { +multiclass APreLoad<bits<1> read, bits<1> data, string opc> { - def i : AXI<(outs), (ins GPR:$base, neg_zero:$imm), MiscFrm, NoItinerary, - !strconcat(opc, "\t[$base, $imm]"), []> { + def i12 : AXI<(outs), (ins addrmode_imm12:$addr), MiscFrm, IIC_Preload, + !strconcat(opc, "\t$addr"), + [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]> { + bits<4> Rt; + bits<17> addr; let Inst{31-26} = 0b111101; let Inst{25} = 0; // 0 for immediate form let Inst{24} = data; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) let Inst{22} = read; let Inst{21-20} = 0b01; + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = 0b1111; + let Inst{11-0} = addr{11-0}; // imm12 } - def r : AXI<(outs), (ins addrmode2:$addr), MiscFrm, NoItinerary, - !strconcat(opc, "\t$addr"), []> { + def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload, + !strconcat(opc, "\t$shift"), + [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]> { + bits<17> shift; let Inst{31-26} = 0b111101; let Inst{25} = 1; // 1 for register form let Inst{24} = data; + let Inst{23} = shift{12}; // U (add = ('U' == 1)) let Inst{22} = read; let Inst{21-20} = 0b01; - let Inst{4} = 0; + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = 0b1111; + let Inst{11-0} = shift{11-0}; } } -defm PLD : APreLoad<1, 1, "pld">; -defm PLDW : APreLoad<1, 0, "pldw">; -defm PLI : APreLoad<0, 1, "pli">; - -def SETENDBE : AXI<(outs),(ins), MiscFrm, NoItinerary, "setend\tbe", - [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM]> { - let Inst{31-28} = 0b1111; - let Inst{27-20} = 0b00010000; - let Inst{16} = 1; - let Inst{9} = 1; - let Inst{7-4} = 0b0000; -} +defm PLD : APreLoad<1, 1, "pld">, Requires<[IsARM]>; +defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>; +defm PLI : APreLoad<1, 0, "pli">, Requires<[IsARM,HasV7]>; -def SETENDLE : AXI<(outs),(ins), MiscFrm, NoItinerary, "setend\tle", - [/* For disassembly only; pattern left blank */]>, +def SETEND : AXI<(outs),(ins setend_op:$end), MiscFrm, NoItinerary, + "setend\t$end", + [/* For disassembly only; pattern left blank */]>, Requires<[IsARM]> { - let Inst{31-28} = 0b1111; - let Inst{27-20} = 0b00010000; - let Inst{16} = 1; - let Inst{9} = 0; - let Inst{7-4} = 0b0000; + bits<1> end; + let Inst{31-10} = 0b1111000100000001000000; + let Inst{9} = end; + let Inst{8-0} = 0; } def DBG : AI<(outs), (ins i32imm:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV7]> { - let Inst{27-16} = 0b001100100000; - let Inst{7-4} = 0b1111; + bits<4> opt; + let Inst{27-4} = 0b001100100000111100001111; + let Inst{3-0} = opt; } // A5.4 Permanently UNDEFINED instructions. -// FIXME: Temporary emitted as raw bytes until this pseudo-op will be added to -// binutils let isBarrier = 1, isTerminator = 1 in -def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, - ".long 0xe7ffdefe ${:comment} trap", [(trap)]>, +def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, + "trap", [(trap)]>, Requires<[IsARM]> { - let Inst{27-25} = 0b011; - let Inst{24-20} = 0b11111; - let Inst{7-5} = 0b111; - let Inst{4} = 0b1; + let Inst = 0xe7ffdefe; } // Address computation and loads and stores in PIC mode. let isNotDuplicable = 1 in { -def PICADD : AXI1<0b0100, (outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p), - Pseudo, IIC_iALUr, "\n$cp:\n\tadd$p\t$dst, pc, $a", - [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>; +def PICADD : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p), + Size4Bytes, IIC_iALUr, + [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>; let AddedComplexity = 10 in { -def PICLDR : AXI2ldw<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), - Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr$p\t$dst, $addr", - [(set GPR:$dst, (load addrmodepc:$addr))]>; +def PICLDR : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), + Size4Bytes, IIC_iLoad_r, + [(set GPR:$dst, (load addrmodepc:$addr))]>; -def PICLDRH : AXI3ldh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), - Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrh${p}\t$dst, $addr", - [(set GPR:$dst, (zextloadi16 addrmodepc:$addr))]>; +def PICLDRH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + Size4Bytes, IIC_iLoad_bh_r, + [(set GPR:$Rt, (zextloadi16 addrmodepc:$addr))]>; -def PICLDRB : AXI2ldb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), - Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrb${p}\t$dst, $addr", - [(set GPR:$dst, (zextloadi8 addrmodepc:$addr))]>; +def PICLDRB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + Size4Bytes, IIC_iLoad_bh_r, + [(set GPR:$Rt, (zextloadi8 addrmodepc:$addr))]>; -def PICLDRSH : AXI3ldsh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), - Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrsh${p}\t$dst, $addr", - [(set GPR:$dst, (sextloadi16 addrmodepc:$addr))]>; +def PICLDRSH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + Size4Bytes, IIC_iLoad_bh_r, + [(set GPR:$Rt, (sextloadi16 addrmodepc:$addr))]>; -def PICLDRSB : AXI3ldsb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), - Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrsb${p}\t$dst, $addr", - [(set GPR:$dst, (sextloadi8 addrmodepc:$addr))]>; +def PICLDRSB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + Size4Bytes, IIC_iLoad_bh_r, + [(set GPR:$Rt, (sextloadi8 addrmodepc:$addr))]>; } let AddedComplexity = 10 in { -def PICSTR : AXI2stw<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), - Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr$p\t$src, $addr", - [(store GPR:$src, addrmodepc:$addr)]>; +def PICSTR : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + Size4Bytes, IIC_iStore_r, [(store GPR:$src, addrmodepc:$addr)]>; -def PICSTRH : AXI3sth<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), - Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstrh${p}\t$src, $addr", - [(truncstorei16 GPR:$src, addrmodepc:$addr)]>; +def PICSTRH : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + Size4Bytes, IIC_iStore_bh_r, [(truncstorei16 GPR:$src, + addrmodepc:$addr)]>; -def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), - Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstrb${p}\t$src, $addr", - [(truncstorei8 GPR:$src, addrmodepc:$addr)]>; +def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + Size4Bytes, IIC_iStore_bh_r, [(truncstorei8 GPR:$src, addrmodepc:$addr)]>; } } // isNotDuplicable = 1 // LEApcrel - Load a pc-relative address into a register without offending the // assembler. -let neverHasSideEffects = 1 in { -let isReMaterializable = 1 in -def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), - Pseudo, IIC_iALUi, - "adr$p\t$dst, #$label", []>; - -} // neverHasSideEffects -def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), - (ins i32imm:$label, nohash_imm:$id, pred:$p), - Pseudo, IIC_iALUi, - "adr$p\t$dst, #${label}_${id}", []> { - let Inst{25} = 1; +let neverHasSideEffects = 1, isReMaterializable = 1 in +// The 'adr' mnemonic encodes differently if the label is before or after +// the instruction. The {24-21} opcode bits are set by the fixup, as we don't +// know until then which form of the instruction will be used. +def ADR : AI1<0, (outs GPR:$Rd), (ins adrlabel:$label), + MiscFrm, IIC_iALUi, "adr", "\t$Rd, #$label", []> { + bits<4> Rd; + bits<12> label; + let Inst{27-25} = 0b001; + let Inst{20} = 0; + let Inst{19-16} = 0b1111; + let Inst{15-12} = Rd; + let Inst{11-0} = label; } +def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p), + Size4Bytes, IIC_iALUi, []>; + +def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd), + (ins i32imm:$label, nohash_imm:$id, pred:$p), + Size4Bytes, IIC_iALUi, []>; //===----------------------------------------------------------------------===// // Control Flow Instructions. @@ -893,159 +1273,139 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in { def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br, "bx", "\tlr", [(ARMretflag)]>, Requires<[IsARM, HasV4T]> { - let Inst{3-0} = 0b1110; - let Inst{7-4} = 0b0001; - let Inst{19-8} = 0b111111111111; - let Inst{27-20} = 0b00010010; + let Inst{27-0} = 0b0001001011111111111100011110; } // ARMV4 only - def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br, + def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br, "mov", "\tpc, lr", [(ARMretflag)]>, Requires<[IsARM, NoV4T]> { - let Inst{11-0} = 0b000000001110; - let Inst{15-12} = 0b1111; - let Inst{19-16} = 0b0000; - let Inst{27-20} = 0b00011010; + let Inst{27-0} = 0b0001101000001111000000001110; } } // Indirect branches let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // ARMV4T and above - def BRIND : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst", + def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst", [(brind GPR:$dst)]>, Requires<[IsARM, HasV4T]> { - let Inst{7-4} = 0b0001; - let Inst{19-8} = 0b111111111111; - let Inst{27-20} = 0b00010010; - let Inst{31-28} = 0b1110; + bits<4> dst; + let Inst{31-4} = 0b1110000100101111111111110001; + let Inst{3-0} = dst; } // ARMV4 only - def MOVPCRX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "mov\tpc, $dst", - [(brind GPR:$dst)]>, - Requires<[IsARM, NoV4T]> { - let Inst{11-4} = 0b00000000; - let Inst{15-12} = 0b1111; - let Inst{19-16} = 0b0000; - let Inst{27-20} = 0b00011010; - let Inst{31-28} = 0b1110; - } + // FIXME: We would really like to define this as a vanilla ARMPat like: + // ARMPat<(brind GPR:$dst), (MOVr PC, GPR:$dst)> + // With that, however, we can't set isBranch, isTerminator, etc.. + def MOVPCRX : ARMPseudoInst<(outs), (ins GPR:$dst), + Size4Bytes, IIC_Br, [(brind GPR:$dst)]>, + Requires<[IsARM, NoV4T]>; } -// FIXME: remove when we have a way to marking a MI with these properties. -// FIXME: Should pc be an implicit operand like PICADD, etc? -let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, - hasExtraDefRegAllocReq = 1 in - def LDM_RET : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$dsts, variable_ops), - IndexModeUpd, LdStMulFrm, IIC_Br, - "ldm${addr:submode}${p}\t$addr!, $dsts", - "$addr.addr = $wb", []>; - -// On non-Darwin platforms R9 is callee-saved. +// All calls clobber the non-callee saved registers. SP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. let isCall = 1, + // On non-Darwin platforms R9 is callee-saved. Defs = [R0, R1, R2, R3, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D16, D17, D18, D19, D20, D21, D22, D23, - D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { - def BL : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops), - IIC_Br, "bl\t${func:call}", + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR], + Uses = [SP] in { + def BL : ABXI<0b1011, (outs), (ins bl_target:$func, variable_ops), + IIC_Br, "bl\t$func", [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsNotDarwin]> { let Inst{31-28} = 0b1110; + bits<24> func; + let Inst{23-0} = func; } - def BL_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops), - IIC_Br, "bl", "\t${func:call}", + def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func, variable_ops), + IIC_Br, "bl", "\t$func", [(ARMcall_pred tglobaladdr:$func)]>, - Requires<[IsARM, IsNotDarwin]>; + Requires<[IsARM, IsNotDarwin]> { + bits<24> func; + let Inst{23-0} = func; + } // ARMv5T and above def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, IIC_Br, "blx\t$func", [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsNotDarwin]> { - let Inst{7-4} = 0b0011; - let Inst{19-8} = 0b111111111111; - let Inst{27-20} = 0b00010010; + bits<4> func; + let Inst{31-4} = 0b1110000100101111111111110011; + let Inst{3-0} = func; } // ARMv4T // Note: Restrict $func to the tGPR regclass to prevent it being in LR. - def BX : ABXIx2<(outs), (ins tGPR:$func, variable_ops), - IIC_Br, "mov\tlr, pc\n\tbx\t$func", - [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsARM, HasV4T, IsNotDarwin]> { - let Inst{7-4} = 0b0001; - let Inst{19-8} = 0b111111111111; - let Inst{27-20} = 0b00010010; - } + def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + Size8Bytes, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, HasV4T, IsNotDarwin]>; // ARMv4 - def BMOVPCRX : ABXIx2<(outs), (ins tGPR:$func, variable_ops), - IIC_Br, "mov\tlr, pc\n\tmov\tpc, $func", - [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsARM, NoV4T, IsNotDarwin]> { - let Inst{11-4} = 0b00000000; - let Inst{15-12} = 0b1111; - let Inst{19-16} = 0b0000; - let Inst{27-20} = 0b00011010; - } + def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + Size8Bytes, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, NoV4T, IsNotDarwin]>; } -// On Darwin R9 is call-clobbered. let isCall = 1, + // On Darwin R9 is call-clobbered. + // R7 is marked as a use to prevent frame-pointer assignments from being + // moved above / below calls. Defs = [R0, R1, R2, R3, R9, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D16, D17, D18, D19, D20, D21, D22, D23, - D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { - def BLr9 : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops), - IIC_Br, "bl\t${func:call}", + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR], + Uses = [R7, SP] in { + def BLr9 : ABXI<0b1011, (outs), (ins bltarget:$func, variable_ops), + IIC_Br, "bl\t$func", [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]> { let Inst{31-28} = 0b1110; + bits<24> func; + let Inst{23-0} = func; } - def BLr9_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops), - IIC_Br, "bl", "\t${func:call}", + def BLr9_pred : ABI<0b1011, (outs), (ins bltarget:$func, variable_ops), + IIC_Br, "bl", "\t$func", [(ARMcall_pred tglobaladdr:$func)]>, - Requires<[IsARM, IsDarwin]>; + Requires<[IsARM, IsDarwin]> { + bits<24> func; + let Inst{23-0} = func; + } // ARMv5T and above def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, IIC_Br, "blx\t$func", [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> { - let Inst{7-4} = 0b0011; - let Inst{19-8} = 0b111111111111; - let Inst{27-20} = 0b00010010; + bits<4> func; + let Inst{31-4} = 0b1110000100101111111111110011; + let Inst{3-0} = func; } // ARMv4T // Note: Restrict $func to the tGPR regclass to prevent it being in LR. - def BXr9 : ABXIx2<(outs), (ins tGPR:$func, variable_ops), - IIC_Br, "mov\tlr, pc\n\tbx\t$func", - [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsARM, HasV4T, IsDarwin]> { - let Inst{7-4} = 0b0001; - let Inst{19-8} = 0b111111111111; - let Inst{27-20} = 0b00010010; - } + def BXr9_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + Size8Bytes, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, HasV4T, IsDarwin]>; // ARMv4 - def BMOVPCRXr9 : ABXIx2<(outs), (ins tGPR:$func, variable_ops), - IIC_Br, "mov\tlr, pc\n\tmov\tpc, $func", - [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsARM, NoV4T, IsDarwin]> { - let Inst{11-4} = 0b00000000; - let Inst{15-12} = 0b1111; - let Inst{19-16} = 0b0000; - let Inst{27-20} = 0b00011010; - } + def BMOVPCRXr9_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + Size8Bytes, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, NoV4T, IsDarwin]>; } // Tail calls. +// FIXME: These should probably be xformed into the non-TC versions of the +// instructions as part of MC lowering. +// FIXME: These seem to be used for both Thumb and ARM instruction selection. +// Thumb should have its own version since the instruction is actually +// different, even though the mnemonic is the same. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // Darwin versions. let Defs = [R0, R1, R2, R3, R9, R12, @@ -1053,29 +1413,26 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31, PC], Uses = [SP] in { - def TCRETURNdi : AInoP<(outs), (ins i32imm:$dst, variable_ops), - Pseudo, IIC_Br, - "@TC_RETURN","\t$dst", []>, Requires<[IsDarwin]>; + def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops), + IIC_Br, []>, Requires<[IsDarwin]>; - def TCRETURNri : AInoP<(outs), (ins tcGPR:$dst, variable_ops), - Pseudo, IIC_Br, - "@TC_RETURN","\t$dst", []>, Requires<[IsDarwin]>; + def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + IIC_Br, []>, Requires<[IsDarwin]>; def TAILJMPd : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), IIC_Br, "b\t$dst @ TAILCALL", - []>, Requires<[IsDarwin]>; + []>, Requires<[IsARM, IsDarwin]>; def TAILJMPdt: ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), IIC_Br, "b.w\t$dst @ TAILCALL", - []>, Requires<[IsDarwin]>; + []>, Requires<[IsThumb, IsDarwin]>; def TAILJMPr : AXI<(outs), (ins tcGPR:$dst, variable_ops), BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", []>, Requires<[IsDarwin]> { - let Inst{7-4} = 0b0001; - let Inst{19-8} = 0b111111111111; - let Inst{27-20} = 0b00010010; - let Inst{31-28} = 0b1110; + bits<4> dst; + let Inst{31-4} = 0b1110000100101111111111110001; + let Inst{3-0} = dst; } } @@ -1085,13 +1442,11 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31, PC], Uses = [SP] in { - def TCRETURNdiND : AInoP<(outs), (ins i32imm:$dst, variable_ops), - Pseudo, IIC_Br, - "@TC_RETURN","\t$dst", []>, Requires<[IsNotDarwin]>; + def TCRETURNdiND : PseudoInst<(outs), (ins i32imm:$dst, variable_ops), + IIC_Br, []>, Requires<[IsNotDarwin]>; - def TCRETURNriND : AInoP<(outs), (ins tcGPR:$dst, variable_ops), - Pseudo, IIC_Br, - "@TC_RETURN","\t$dst", []>, Requires<[IsNotDarwin]>; + def TCRETURNriND : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + IIC_Br, []>, Requires<[IsNotDarwin]>; def TAILJMPdND : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), IIC_Br, "b\t$dst @ TAILCALL", @@ -1104,10 +1459,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { def TAILJMPrND : AXI<(outs), (ins tcGPR:$dst, variable_ops), BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", []>, Requires<[IsNotDarwin]> { - let Inst{7-4} = 0b0001; - let Inst{19-8} = 0b111111111111; - let Inst{27-20} = 0b00010010; - let Inst{31-28} = 0b1110; + bits<4> dst; + let Inst{31-4} = 0b1110000100101111111111110001; + let Inst{3-0} = dst; } } } @@ -1117,48 +1471,40 @@ let isBranch = 1, isTerminator = 1 in { let isBarrier = 1 in { let isPredicable = 1 in def B : ABXI<0b1010, (outs), (ins brtarget:$target), IIC_Br, - "b\t$target", [(br bb:$target)]>; + "b\t$target", [(br bb:$target)]> { + bits<24> target; + let Inst{31-28} = 0b1110; + let Inst{23-0} = target; + } - let isNotDuplicable = 1, isIndirectBranch = 1 in { - def BR_JTr : JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id), - IIC_Br, "mov\tpc, $target$jt", - [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]> { - let Inst{11-4} = 0b00000000; - let Inst{15-12} = 0b1111; - let Inst{20} = 0; // S Bit - let Inst{24-21} = 0b1101; - let Inst{27-25} = 0b000; - } - def BR_JTm : JTI<(outs), - (ins addrmode2:$target, jtblock_operand:$jt, i32imm:$id), - IIC_Br, "ldr\tpc, $target$jt", - [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt, - imm:$id)]> { - let Inst{15-12} = 0b1111; - let Inst{20} = 1; // L bit - let Inst{21} = 0; // W bit - let Inst{22} = 0; // B bit - let Inst{24} = 1; // P bit - let Inst{27-25} = 0b011; - } - def BR_JTadd : JTI<(outs), - (ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id), - IIC_Br, "add\tpc, $target, $idx$jt", - [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt, - imm:$id)]> { - let Inst{15-12} = 0b1111; - let Inst{20} = 0; // S bit - let Inst{24-21} = 0b0100; - let Inst{27-25} = 0b000; - } - } // isNotDuplicable = 1, isIndirectBranch = 1 + let isNotDuplicable = 1, isIndirectBranch = 1 in { + def BR_JTr : ARMPseudoInst<(outs), + (ins GPR:$target, i32imm:$jt, i32imm:$id), + SizeSpecial, IIC_Br, + [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>; + // FIXME: This shouldn't use the generic "addrmode2," but rather be split + // into i12 and rs suffixed versions. + def BR_JTm : ARMPseudoInst<(outs), + (ins addrmode2:$target, i32imm:$jt, i32imm:$id), + SizeSpecial, IIC_Br, + [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt, + imm:$id)]>; + def BR_JTadd : ARMPseudoInst<(outs), + (ins GPR:$target, GPR:$idx, i32imm:$jt, i32imm:$id), + SizeSpecial, IIC_Br, + [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt, + imm:$id)]>; + } // isNotDuplicable = 1, isIndirectBranch = 1 } // isBarrier = 1 // FIXME: should be able to write a pattern for ARMBrcond, but can't use // a two-value operand where a dag node expects two operands. :( - def Bcc : ABI<0b1010, (outs), (ins brtarget:$target), + def Bcc : ABI<0b1010, (outs), (ins br_target:$target), IIC_Br, "b", "\t$target", - [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>; + [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]> { + bits<24> target; + let Inst{23-0} = target; + } } // Branch and Exchange Jazelle -- for disassembly only @@ -1172,271 +1518,303 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", // Secure Monitor Call is a system instruction -- for disassembly only def SMC : ABI<0b0001, (outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt", [/* For disassembly only; pattern left blank */]> { - let Inst{23-20} = 0b0110; - let Inst{7-4} = 0b0111; + bits<4> opt; + let Inst{23-4} = 0b01100000000000000111; + let Inst{3-0} = opt; } // Supervisor Call (Software Interrupt) -- for disassembly only -let isCall = 1 in { +let isCall = 1, Uses = [SP] in { def SVC : ABI<0b1111, (outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]> { + bits<24> svc; + let Inst{23-0} = svc; +} } // Store Return State is a system instruction -- for disassembly only -def SRSW : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, i32imm:$mode), - NoItinerary, "srs${addr:submode}\tsp!, $mode", +let isCodeGenOnly = 1 in { // FIXME: This should not use submode! +def SRSW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode), + NoItinerary, "srs${amode}\tsp!, $mode", [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b110; // W = 1 } -def SRS : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, i32imm:$mode), - NoItinerary, "srs${addr:submode}\tsp, $mode", +def SRS : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode), + NoItinerary, "srs${amode}\tsp, $mode", [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b100; // W = 0 } // Return From Exception is a system instruction -- for disassembly only -def RFEW : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, GPR:$base), - NoItinerary, "rfe${addr:submode}\t$base!", +def RFEW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base), + NoItinerary, "rfe${amode}\t$base!", [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b011; // W = 1 } -def RFE : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, GPR:$base), - NoItinerary, "rfe${addr:submode}\t$base", +def RFE : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base), + NoItinerary, "rfe${amode}\t$base", [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b001; // W = 0 } +} // isCodeGenOnly = 1 //===----------------------------------------------------------------------===// // Load / store Instructions. // // Load -let canFoldAsLoad = 1, isReMaterializable = 1 in -def LDR : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr, - "ldr", "\t$dst, $addr", - [(set GPR:$dst, (load addrmode2:$addr))]>; + + +defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, + UnOpFrag<(load node:$Src)>>; +defm LDRB : AI_ldr1<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si, + UnOpFrag<(zextloadi8 node:$Src)>>; +defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, + BinOpFrag<(store node:$LHS, node:$RHS)>>; +defm STRB : AI_str1<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si, + BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; // Special LDR for loads from non-pc-relative constpools. let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1, isReMaterializable = 1 in -def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr, - "ldr", "\t$dst, $addr", []>; +def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr), + AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr", + []> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = 0b1111; + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 +} // Loads with zero extension -def LDRH : AI3ldh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm, - IIC_iLoadr, "ldrh", "\t$dst, $addr", - [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>; - -def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, - IIC_iLoadr, "ldrb", "\t$dst, $addr", - [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>; +def LDRH : AI3ld<0b1011, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_bh_r, "ldrh", "\t$Rt, $addr", + [(set GPR:$Rt, (zextloadi16 addrmode3:$addr))]>; // Loads with sign extension -def LDRSH : AI3ldsh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm, - IIC_iLoadr, "ldrsh", "\t$dst, $addr", - [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>; - -def LDRSB : AI3ldsb<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm, - IIC_iLoadr, "ldrsb", "\t$dst, $addr", - [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>; - -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +def LDRSH : AI3ld<0b1111, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr", + [(set GPR:$Rt, (sextloadi16 addrmode3:$addr))]>; + +def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr", + [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, + isCodeGenOnly = 1 in { // $dst2 doesn't exist in asmstring? +// FIXME: $dst2 isn't in the asm string as it's implied by $Rd (dst2 = Rd+1) +// how to represent that such that tblgen is happy and we don't +// mark this codegen only? // Load doubleword -def LDRD : AI3ldd<(outs GPR:$dst1, GPR:$dst2), (ins addrmode3:$addr), LdMiscFrm, - IIC_iLoadr, "ldrd", "\t$dst1, $addr", +def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2), + (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_d_r, "ldrd", "\t$Rd, $addr", []>, Requires<[IsARM, HasV5TE]>; +} // Indexed loads -def LDR_PRE : AI2ldwpr<(outs GPR:$dst, GPR:$base_wb), - (ins addrmode2:$addr), LdFrm, IIC_iLoadru, - "ldr", "\t$dst, $addr!", "$addr.base = $base_wb", []>; - -def LDR_POST : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru, - "ldr", "\t$dst, [$base], $offset", "$base = $base_wb", []>; - -def LDRH_PRE : AI3ldhpr<(outs GPR:$dst, GPR:$base_wb), - (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru, - "ldrh", "\t$dst, $addr!", "$addr.base = $base_wb", []>; - -def LDRH_POST : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, - "ldrh", "\t$dst, [$base], $offset", "$base = $base_wb", []>; - -def LDRB_PRE : AI2ldbpr<(outs GPR:$dst, GPR:$base_wb), - (ins addrmode2:$addr), LdFrm, IIC_iLoadru, - "ldrb", "\t$dst, $addr!", "$addr.base = $base_wb", []>; - -def LDRB_POST : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru, - "ldrb", "\t$dst, [$base], $offset", "$base = $base_wb", []>; - -def LDRSH_PRE : AI3ldshpr<(outs GPR:$dst, GPR:$base_wb), - (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru, - "ldrsh", "\t$dst, $addr!", "$addr.base = $base_wb", []>; - -def LDRSH_POST: AI3ldshpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, - "ldrsh", "\t$dst, [$base], $offset", "$base = $base_wb", []>; - -def LDRSB_PRE : AI3ldsbpr<(outs GPR:$dst, GPR:$base_wb), - (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru, - "ldrsb", "\t$dst, $addr!", "$addr.base = $base_wb", []>; - -def LDRSB_POST: AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, - "ldrsb", "\t$dst, [$base], $offset", "$base = $base_wb", []>; +multiclass AI2_ldridx<bit isByte, string opc, InstrItinClass itin> { + def _PRE : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addrmode2:$addr), IndexModePre, LdFrm, itin, + opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { + // {17-14} Rn + // {13} 1 == Rm, 0 == imm12 + // {12} isAdd + // {11-0} imm12/Rm + bits<18> addr; + let Inst{25} = addr{13}; + let Inst{23} = addr{12}; + let Inst{19-16} = addr{17-14}; + let Inst{11-0} = addr{11-0}; + } + def _POST : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins GPR:$Rn, am2offset:$offset), + IndexModePost, LdFrm, itin, + opc, "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", []> { + // {13} 1 == Rm, 0 == imm12 + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> Rn; + let Inst{25} = offset{13}; + let Inst{23} = offset{12}; + let Inst{19-16} = Rn; + let Inst{11-0} = offset{11-0}; + } +} -// For disassembly only -def LDRD_PRE : AI3lddpr<(outs GPR:$dst1, GPR:$dst2, GPR:$base_wb), - (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadr, - "ldrd", "\t$dst1, $dst2, $addr!", "$addr.base = $base_wb", []>, - Requires<[IsARM, HasV5TE]>; +let mayLoad = 1, neverHasSideEffects = 1 in { +defm LDR : AI2_ldridx<0, "ldr", IIC_iLoad_ru>; +defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_ru>; +} -// For disassembly only -def LDRD_POST : AI3lddpo<(outs GPR:$dst1, GPR:$dst2, GPR:$base_wb), - (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadr, - "ldrd", "\t$dst1, $dst2, [$base], $offset", "$base = $base_wb", []>, - Requires<[IsARM, HasV5TE]>; +multiclass AI3_ldridx<bits<4> op, bit op20, string opc, InstrItinClass itin> { + def _PRE : AI3ldstidx<op, op20, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addrmode3:$addr), IndexModePre, + LdMiscFrm, itin, + opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { + bits<14> addr; + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr{12-9}; // Rn + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{3-0} = addr{3-0}; // imm3_0/Rm + } + def _POST : AI3ldstidx<op, op20, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins GPR:$Rn, am3offset:$offset), IndexModePost, + LdMiscFrm, itin, + opc, "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", []> { + bits<10> offset; + bits<4> Rn; + let Inst{23} = offset{8}; // U bit + let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm + let Inst{19-16} = Rn; + let Inst{11-8} = offset{7-4}; // imm7_4/zero + let Inst{3-0} = offset{3-0}; // imm3_0/Rm + } +} -} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 +let mayLoad = 1, neverHasSideEffects = 1 in { +defm LDRH : AI3_ldridx<0b1011, 1, "ldrh", IIC_iLoad_bh_ru>; +defm LDRSH : AI3_ldridx<0b1111, 1, "ldrsh", IIC_iLoad_bh_ru>; +defm LDRSB : AI3_ldridx<0b1101, 1, "ldrsb", IIC_iLoad_bh_ru>; +let hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in +defm LDRD : AI3_ldridx<0b1101, 0, "ldrd", IIC_iLoad_d_ru>; +} // mayLoad = 1, neverHasSideEffects = 1 // LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT are for disassembly only. - -def LDRT : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru, +let mayLoad = 1, neverHasSideEffects = 1 in { +def LDRT : AI2ldstidx<1, 0, 0, (outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, am2offset:$offset), IndexModeNone, + LdFrm, IIC_iLoad_ru, "ldrt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { let Inst{21} = 1; // overwrite } - -def LDRBT : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru, +def LDRBT : AI2ldstidx<1, 1, 0, (outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, am2offset:$offset), IndexModeNone, + LdFrm, IIC_iLoad_bh_ru, "ldrbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { let Inst{21} = 1; // overwrite } - -def LDRSBT : AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, +def LDRSBT : AI3ldstidx<0b1101, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, am3offset:$offset), IndexModePost, + LdMiscFrm, IIC_iLoad_bh_ru, "ldrsbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { let Inst{21} = 1; // overwrite } - -def LDRHT : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am3offset:$offset), LdMiscFrm, IIC_iLoadru, - "ldrht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { +def LDRHT : AI3ldstidx<0b1011, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, am3offset:$offset), IndexModePost, + LdMiscFrm, IIC_iLoad_bh_ru, + "ldrht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { let Inst{21} = 1; // overwrite } - -def LDRSHT : AI3ldshpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, +def LDRSHT : AI3ldstidx<0b1111, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, am3offset:$offset), IndexModePost, + LdMiscFrm, IIC_iLoad_bh_ru, "ldrsht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { let Inst{21} = 1; // overwrite } +} // Store -def STR : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer, - "str", "\t$src, $addr", - [(store GPR:$src, addrmode2:$addr)]>; // Stores with truncate -def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm, - IIC_iStorer, "strh", "\t$src, $addr", - [(truncstorei16 GPR:$src, addrmode3:$addr)]>; - -def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer, - "strb", "\t$src, $addr", - [(truncstorei8 GPR:$src, addrmode2:$addr)]>; +def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm, + IIC_iStore_bh_r, "strh", "\t$Rt, $addr", + [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>; // Store doubleword -let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in -def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr), - StMiscFrm, IIC_iStorer, +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1, + isCodeGenOnly = 1 in // $src2 doesn't exist in asm string +def STRD : AI3str<0b1111, (outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr), + StMiscFrm, IIC_iStore_d_r, "strd", "\t$src1, $addr", []>, Requires<[IsARM, HasV5TE]>; // Indexed stores -def STR_PRE : AI2stwpr<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base, am2offset:$offset), - StFrm, IIC_iStoreru, - "str", "\t$src, [$base, $offset]!", "$base = $base_wb", - [(set GPR:$base_wb, - (pre_store GPR:$src, GPR:$base, am2offset:$offset))]>; - -def STR_POST : AI2stwpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), - StFrm, IIC_iStoreru, - "str", "\t$src, [$base], $offset", "$base = $base_wb", - [(set GPR:$base_wb, - (post_store GPR:$src, GPR:$base, am2offset:$offset))]>; - -def STRH_PRE : AI3sthpr<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am3offset:$offset), - StMiscFrm, IIC_iStoreru, - "strh", "\t$src, [$base, $offset]!", "$base = $base_wb", - [(set GPR:$base_wb, - (pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>; - -def STRH_POST: AI3sthpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am3offset:$offset), - StMiscFrm, IIC_iStoreru, - "strh", "\t$src, [$base], $offset", "$base = $base_wb", - [(set GPR:$base_wb, (post_truncsti16 GPR:$src, - GPR:$base, am3offset:$offset))]>; - -def STRB_PRE : AI2stbpr<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), - StFrm, IIC_iStoreru, - "strb", "\t$src, [$base, $offset]!", "$base = $base_wb", - [(set GPR:$base_wb, (pre_truncsti8 GPR:$src, - GPR:$base, am2offset:$offset))]>; - -def STRB_POST: AI2stbpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), - StFrm, IIC_iStoreru, - "strb", "\t$src, [$base], $offset", "$base = $base_wb", - [(set GPR:$base_wb, (post_truncsti8 GPR:$src, - GPR:$base, am2offset:$offset))]>; +def STR_PRE : AI2stridx<0, 1, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), + IndexModePre, StFrm, IIC_iStore_ru, + "str", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + [(set GPR:$Rn_wb, + (pre_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; + +def STR_POST : AI2stridx<0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), + IndexModePost, StFrm, IIC_iStore_ru, + "str", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + [(set GPR:$Rn_wb, + (post_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; + +def STRB_PRE : AI2stridx<1, 1, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), + IndexModePre, StFrm, IIC_iStore_bh_ru, + "strb", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + [(set GPR:$Rn_wb, (pre_truncsti8 GPR:$Rt, + GPR:$Rn, am2offset:$offset))]>; +def STRB_POST: AI2stridx<1, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), + IndexModePost, StFrm, IIC_iStore_bh_ru, + "strb", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + [(set GPR:$Rn_wb, (post_truncsti8 GPR:$Rt, + GPR:$Rn, am2offset:$offset))]>; + +def STRH_PRE : AI3stridx<0b1011, 0, 1, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am3offset:$offset), + IndexModePre, StMiscFrm, IIC_iStore_ru, + "strh", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + [(set GPR:$Rn_wb, + (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>; + +def STRH_POST: AI3stridx<0b1011, 0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am3offset:$offset), + IndexModePost, StMiscFrm, IIC_iStore_bh_ru, + "strh", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt, + GPR:$Rn, am3offset:$offset))]>; // For disassembly only def STRD_PRE : AI3stdpr<(outs GPR:$base_wb), (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset), - StMiscFrm, IIC_iStoreru, + StMiscFrm, IIC_iStore_d_ru, "strd", "\t$src1, $src2, [$base, $offset]!", "$base = $base_wb", []>; // For disassembly only def STRD_POST: AI3stdpo<(outs GPR:$base_wb), (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset), - StMiscFrm, IIC_iStoreru, + StMiscFrm, IIC_iStore_d_ru, "strd", "\t$src1, $src2, [$base], $offset", "$base = $base_wb", []>; // STRT, STRBT, and STRHT are for disassembly only. -def STRT : AI2stwpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), - StFrm, IIC_iStoreru, - "strt", "\t$src, [$base], $offset", "$base = $base_wb", +def STRT : AI2stridx<0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn,am2offset:$offset), + IndexModeNone, StFrm, IIC_iStore_ru, + "strt", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", [/* For disassembly only; pattern left blank */]> { let Inst{21} = 1; // overwrite } -def STRBT : AI2stbpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), - StFrm, IIC_iStoreru, - "strbt", "\t$src, [$base], $offset", "$base = $base_wb", +def STRBT : AI2stridx<1, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), + IndexModeNone, StFrm, IIC_iStore_bh_ru, + "strbt", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", [/* For disassembly only; pattern left blank */]> { let Inst{21} = 1; // overwrite } def STRHT: AI3sthpo<(outs GPR:$base_wb), (ins GPR:$src, GPR:$base,am3offset:$offset), - StMiscFrm, IIC_iStoreru, + StMiscFrm, IIC_iStore_bh_ru, "strht", "\t$src, [$base], $offset", "$base = $base_wb", [/* For disassembly only; pattern left blank */]> { let Inst{21} = 1; // overwrite @@ -1446,103 +1824,212 @@ def STRHT: AI3sthpo<(outs GPR:$base_wb), // Load / store multiple Instructions. // -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { -def LDM : AXI4ld<(outs), (ins addrmode4:$addr, pred:$p, - reglist:$dsts, variable_ops), - IndexModeNone, LdStMulFrm, IIC_iLoadm, - "ldm${addr:submode}${p}\t$addr, $dsts", "", []>; - -def LDM_UPD : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$dsts, variable_ops), - IndexModeUpd, LdStMulFrm, IIC_iLoadm, - "ldm${addr:submode}${p}\t$addr!, $dsts", - "$addr.addr = $wb", []>; -} // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq - -let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { -def STM : AXI4st<(outs), (ins addrmode4:$addr, pred:$p, - reglist:$srcs, variable_ops), - IndexModeNone, LdStMulFrm, IIC_iStorem, - "stm${addr:submode}${p}\t$addr, $srcs", "", []>; - -def STM_UPD : AXI4st<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$srcs, variable_ops), - IndexModeUpd, LdStMulFrm, IIC_iStorem, - "stm${addr:submode}${p}\t$addr!, $srcs", - "$addr.addr = $wb", []>; -} // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq +multiclass arm_ldst_mult<string asm, bit L_bit, Format f, + InstrItinClass itin, InstrItinClass itin_upd> { + def IA : + AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeNone, f, itin, + !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def IA_UPD : + AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeUpd, f, itin_upd, + !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + } + def DA : + AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeNone, f, itin, + !strconcat(asm, "da${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b00; // Decrement After + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def DA_UPD : + AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeUpd, f, itin_upd, + !strconcat(asm, "da${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b00; // Decrement After + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + } + def DB : + AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeNone, f, itin, + !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def DB_UPD : + AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeUpd, f, itin_upd, + !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + } + def IB : + AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeNone, f, itin, + !strconcat(asm, "ib${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b11; // Increment Before + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def IB_UPD : + AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeUpd, f, itin_upd, + !strconcat(asm, "ib${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b11; // Increment Before + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + } +} + +let neverHasSideEffects = 1 in { + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm LDM : arm_ldst_mult<"ldm", 1, LdStMulFrm, IIC_iLoad_m, IIC_iLoad_mu>; + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm STM : arm_ldst_mult<"stm", 0, LdStMulFrm, IIC_iStore_m, IIC_iStore_mu>; + +} // neverHasSideEffects + +// Load / Store Multiple Mnemonic Aliases +def : MnemonicAlias<"ldm", "ldmia">; +def : MnemonicAlias<"stm", "stmia">; + +// FIXME: remove when we have a way to marking a MI with these properties. +// FIXME: Should pc be an implicit operand like PICADD, etc? +let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, + hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in +// FIXME: Should be a pseudo-instruction. +def LDMIA_RET : AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, + reglist:$regs, variable_ops), + IndexModeUpd, LdStMulFrm, IIC_iLoad_mBr, + "ldmia${p}\t$Rn!, $regs", + "$Rn = $wb", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 1; // Writeback + let Inst{20} = 1; // Load +} //===----------------------------------------------------------------------===// // Move Instructions. // let neverHasSideEffects = 1 in -def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, - "mov", "\t$dst, $src", []>, UnaryDP { +def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr, + "mov", "\t$Rd, $Rm", []>, UnaryDP { + bits<4> Rd; + bits<4> Rm; + let Inst{11-4} = 0b00000000; let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; } // A version for the smaller set of tail call registers. let neverHasSideEffects = 1 in -def MOVr_TC : AsI1<0b1101, (outs tcGPR:$dst), (ins tcGPR:$src), DPFrm, - IIC_iMOVr, "mov", "\t$dst, $src", []>, UnaryDP { +def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm, + IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP { + bits<4> Rd; + bits<4> Rm; + let Inst{11-4} = 0b00000000; let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; } -def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), +def MOVs : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg:$src), DPSoRegFrm, IIC_iMOVsr, - "mov", "\t$dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP { + "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg:$src)]>, + UnaryDP { + bits<4> Rd; + bits<12> src; + let Inst{15-12} = Rd; + let Inst{11-0} = src; let Inst{25} = 0; } -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm, IIC_iMOVi, - "mov", "\t$dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP { +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi, + "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP { + bits<4> Rd; + bits<12> imm; let Inst{25} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = 0b0000; + let Inst{11-0} = imm; } -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src), +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins i32imm_hilo16:$imm), DPFrm, IIC_iMOVi, - "movw", "\t$dst, $src", - [(set GPR:$dst, imm0_65535:$src)]>, + "movw", "\t$Rd, $imm", + [(set GPR:$Rd, imm0_65535:$imm)]>, Requires<[IsARM, HasV6T2]>, UnaryDP { + bits<4> Rd; + bits<16> imm; + let Inst{15-12} = Rd; + let Inst{11-0} = imm{11-0}; + let Inst{19-16} = imm{15-12}; let Inst{20} = 0; let Inst{25} = 1; } -let Constraints = "$src = $dst" in -def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm), +def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), + (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; + +let Constraints = "$src = $Rd" in { +def MOVTi16 : AI1<0b1010, (outs GPR:$Rd), (ins GPR:$src, i32imm_hilo16:$imm), DPFrm, IIC_iMOVi, - "movt", "\t$dst, $imm", - [(set GPR:$dst, + "movt", "\t$Rd, $imm", + [(set GPR:$Rd, (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>, UnaryDP, Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<16> imm; + let Inst{15-12} = Rd; + let Inst{11-0} = imm{11-0}; + let Inst{19-16} = imm{15-12}; let Inst{20} = 0; let Inst{25} = 1; } +def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), + (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; + +} // Constraints + def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>, Requires<[IsARM, HasV6T2]>; let Uses = [CPSR] in -def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi, - "mov", "\t$dst, $src, rrx", - [(set GPR:$dst, (ARMrrx GPR:$src))]>, UnaryDP; +def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi, + [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP, + Requires<[IsARM]>; // These aren't really mov instructions, but we have to define them this way // due to flag operands. let Defs = [CPSR] in { -def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, - IIC_iMOVsi, "movs", "\t$dst, $src, lsr #1", - [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP; -def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, - IIC_iMOVsi, "movs", "\t$dst, $src, asr #1", - [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP; +def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, + [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP, + Requires<[IsARM]>; +def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, + [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP, + Requires<[IsARM]>; } //===----------------------------------------------------------------------===// @@ -1551,31 +2038,31 @@ def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, // Sign extenders -defm SXTB : AI_unary_rrot<0b01101010, - "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; -defm SXTH : AI_unary_rrot<0b01101011, - "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; +defm SXTB : AI_ext_rrot<0b01101010, + "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; +defm SXTH : AI_ext_rrot<0b01101011, + "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; -defm SXTAB : AI_bin_rrot<0b01101010, +defm SXTAB : AI_exta_rrot<0b01101010, "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; -defm SXTAH : AI_bin_rrot<0b01101011, +defm SXTAH : AI_exta_rrot<0b01101011, "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; // For disassembly only -defm SXTB16 : AI_unary_rrot_np<0b01101000, "sxtb16">; +defm SXTB16 : AI_ext_rrot_np<0b01101000, "sxtb16">; // For disassembly only -defm SXTAB16 : AI_bin_rrot_np<0b01101000, "sxtab16">; +defm SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">; // Zero extenders let AddedComplexity = 16 in { -defm UXTB : AI_unary_rrot<0b01101110, - "uxtb" , UnOpFrag<(and node:$Src, 0x000000FF)>>; -defm UXTH : AI_unary_rrot<0b01101111, - "uxth" , UnOpFrag<(and node:$Src, 0x0000FFFF)>>; -defm UXTB16 : AI_unary_rrot<0b01101100, - "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; +defm UXTB : AI_ext_rrot<0b01101110, + "uxtb" , UnOpFrag<(and node:$Src, 0x000000FF)>>; +defm UXTH : AI_ext_rrot<0b01101111, + "uxth" , UnOpFrag<(and node:$Src, 0x0000FFFF)>>; +defm UXTB16 : AI_ext_rrot<0b01101100, + "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; // FIXME: This pattern incorrectly assumes the shl operator is a rotate. // The transformation should probably be done as a combiner action @@ -1586,33 +2073,49 @@ defm UXTB16 : AI_unary_rrot<0b01101100, def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), (UXTB16r_rot GPR:$Src, 8)>; -defm UXTAB : AI_bin_rrot<0b01101110, "uxtab", +defm UXTAB : AI_exta_rrot<0b01101110, "uxtab", BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; -defm UXTAH : AI_bin_rrot<0b01101111, "uxtah", +defm UXTAH : AI_exta_rrot<0b01101111, "uxtah", BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; } // This isn't safe in general, the add is two 16-bit units, not a 32-bit add. // For disassembly only -defm UXTAB16 : AI_bin_rrot_np<0b01101100, "uxtab16">; +defm UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">; -def SBFX : I<(outs GPR:$dst), - (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), - AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi, - "sbfx", "\t$dst, $src, $lsb, $width", "", []>, +def SBFX : I<(outs GPR:$Rd), + (ins GPR:$Rn, imm0_31:$lsb, imm0_31_m1:$width), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi, + "sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>, Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rn; + bits<5> lsb; + bits<5> width; let Inst{27-21} = 0b0111101; let Inst{6-4} = 0b101; + let Inst{20-16} = width; + let Inst{15-12} = Rd; + let Inst{11-7} = lsb; + let Inst{3-0} = Rn; } -def UBFX : I<(outs GPR:$dst), - (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), - AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi, - "ubfx", "\t$dst, $src, $lsb, $width", "", []>, +def UBFX : I<(outs GPR:$Rd), + (ins GPR:$Rn, imm0_31:$lsb, imm0_31_m1:$width), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi, + "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>, Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rn; + bits<5> lsb; + bits<5> width; let Inst{27-21} = 0b0111111; let Inst{6-4} = 0b101; + let Inst{20-16} = width; + let Inst{15-12} = Rd; + let Inst{11-7} = lsb; + let Inst{3-0} = Rn; } //===----------------------------------------------------------------------===// @@ -1620,100 +2123,166 @@ def UBFX : I<(outs GPR:$dst), // defm ADD : AsI1_bin_irs<0b0100, "add", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, BinOpFrag<(add node:$LHS, node:$RHS)>, 1>; defm SUB : AsI1_bin_irs<0b0010, "sub", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, BinOpFrag<(sub node:$LHS, node:$RHS)>>; // ADD and SUB with 's' bit set. defm ADDS : AI1_bin_s_irs<0b0100, "adds", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>; defm SUBS : AI1_bin_s_irs<0b0010, "subs", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, BinOpFrag<(subc node:$LHS, node:$RHS)>>; defm ADC : AI1_adde_sube_irs<0b0101, "adc", BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>; defm SBC : AI1_adde_sube_irs<0b0110, "sbc", BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>; + +// ADC and SUBC with 's' bit set. defm ADCS : AI1_adde_sube_s_irs<0b0101, "adcs", BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; defm SBCS : AI1_adde_sube_s_irs<0b0110, "sbcs", BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>; -def RSBri : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, - IIC_iALUi, "rsb", "\t$dst, $a, $b", - [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]> { - let Inst{25} = 1; +def RSBri : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, + IIC_iALUi, "rsb", "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, (sub so_imm:$imm, GPR:$Rn))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{11-0} = imm; } // The reg/reg form is only defined for the disassembler; for codegen it is // equivalent to SUBrr. -def RSBrr : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, - IIC_iALUr, "rsb", "\t$dst, $a, $b", +def RSBrr : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, + IIC_iALUr, "rsb", "\t$Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]> { - let Inst{25} = 0; - let Inst{11-4} = 0b00000000; + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; } -def RSBrs : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, - IIC_iALUsr, "rsb", "\t$dst, $a, $b", - [(set GPR:$dst, (sub so_reg:$b, GPR:$a))]> { - let Inst{25} = 0; +def RSBrs : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + DPSoRegFrm, IIC_iALUsr, "rsb", "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (sub so_reg:$shift, GPR:$Rn))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{11-0} = shift; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; } // RSB with 's' bit set. -let Defs = [CPSR] in { -def RSBSri : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, - IIC_iALUi, "rsbs", "\t$dst, $a, $b", - [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]> { - let Inst{20} = 1; - let Inst{25} = 1; -} -def RSBSrs : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, - IIC_iALUsr, "rsbs", "\t$dst, $a, $b", - [(set GPR:$dst, (subc so_reg:$b, GPR:$a))]> { - let Inst{20} = 1; - let Inst{25} = 0; +let isCodeGenOnly = 1, Defs = [CPSR] in { +def RSBSri : AI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, + IIC_iALUi, "rsbs", "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, (subc so_imm:$imm, GPR:$Rn))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{20} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{11-0} = imm; +} +def RSBSrs : AI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + DPSoRegFrm, IIC_iALUsr, "rsbs", "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (subc so_reg:$shift, GPR:$Rn))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{11-0} = shift; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; } } let Uses = [CPSR] in { -def RSCri : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), - DPFrm, IIC_iALUi, "rsc", "\t$dst, $a, $b", - [(set GPR:$dst, (sube_dead_carry so_imm:$b, GPR:$a))]>, +def RSCri : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + DPFrm, IIC_iALUi, "rsc", "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>, Requires<[IsARM]> { - let Inst{25} = 1; + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{11-0} = imm; } // The reg/reg form is only defined for the disassembler; for codegen it is // equivalent to SUBrr. -def RSCrr : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - DPFrm, IIC_iALUr, "rsc", "\t$dst, $a, $b", +def RSCrr : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + DPFrm, IIC_iALUr, "rsc", "\t$Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]> { - let Inst{25} = 0; - let Inst{11-4} = 0b00000000; + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; } -def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), - DPSoRegFrm, IIC_iALUsr, "rsc", "\t$dst, $a, $b", - [(set GPR:$dst, (sube_dead_carry so_reg:$b, GPR:$a))]>, +def RSCrs : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + DPSoRegFrm, IIC_iALUsr, "rsc", "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>, Requires<[IsARM]> { - let Inst{25} = 0; + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{11-0} = shift; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; } } // FIXME: Allow these to be predicated. -let Defs = [CPSR], Uses = [CPSR] in { -def RSCSri : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), - DPFrm, IIC_iALUi, "rscs\t$dst, $a, $b", - [(set GPR:$dst, (sube_dead_carry so_imm:$b, GPR:$a))]>, +let isCodeGenOnly = 1, Defs = [CPSR], Uses = [CPSR] in { +def RSCSri : AXI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + DPFrm, IIC_iALUi, "rscs\t$Rd, $Rn, $imm", + [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>, Requires<[IsARM]> { - let Inst{20} = 1; - let Inst{25} = 1; + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{20} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{11-0} = imm; } -def RSCSrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), - DPSoRegFrm, IIC_iALUsr, "rscs\t$dst, $a, $b", - [(set GPR:$dst, (sube_dead_carry so_reg:$b, GPR:$a))]>, +def RSCSrs : AXI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + DPSoRegFrm, IIC_iALUsr, "rscs\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>, Requires<[IsARM]> { - let Inst{20} = 1; - let Inst{25} = 0; + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{11-0} = shift; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; } } @@ -1740,111 +2309,166 @@ def : ARMPat<(adde GPR:$src, so_imm_not:$imm), // ARM Arithmetic Instruction -- for disassembly only // GPR:$dst = GPR:$a op GPR:$b -class AAI<bits<8> op27_20, bits<4> op7_4, string opc, - list<dag> pattern = [/* For disassembly only; pattern left blank */]> - : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, IIC_iALUr, - opc, "\t$dst, $a, $b", pattern> { +class AAI<bits<8> op27_20, bits<8> op11_4, string opc, + list<dag> pattern = [/* For disassembly only; pattern left blank */], + dag iops = (ins GPR:$Rn, GPR:$Rm), string asm = "\t$Rd, $Rn, $Rm"> + : AI<(outs GPR:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> { + bits<4> Rn; + bits<4> Rd; + bits<4> Rm; let Inst{27-20} = op27_20; - let Inst{7-4} = op7_4; + let Inst{11-4} = op11_4; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{3-0} = Rm; } // Saturating add/subtract -- for disassembly only -def QADD : AAI<0b00010000, 0b0101, "qadd", - [(set GPR:$dst, (int_arm_qadd GPR:$a, GPR:$b))]>; -def QADD16 : AAI<0b01100010, 0b0001, "qadd16">; -def QADD8 : AAI<0b01100010, 0b1001, "qadd8">; -def QASX : AAI<0b01100010, 0b0011, "qasx">; -def QDADD : AAI<0b00010100, 0b0101, "qdadd">; -def QDSUB : AAI<0b00010110, 0b0101, "qdsub">; -def QSAX : AAI<0b01100010, 0b0101, "qsax">; -def QSUB : AAI<0b00010010, 0b0101, "qsub", - [(set GPR:$dst, (int_arm_qsub GPR:$a, GPR:$b))]>; -def QSUB16 : AAI<0b01100010, 0b0111, "qsub16">; -def QSUB8 : AAI<0b01100010, 0b1111, "qsub8">; -def UQADD16 : AAI<0b01100110, 0b0001, "uqadd16">; -def UQADD8 : AAI<0b01100110, 0b1001, "uqadd8">; -def UQASX : AAI<0b01100110, 0b0011, "uqasx">; -def UQSAX : AAI<0b01100110, 0b0101, "uqsax">; -def UQSUB16 : AAI<0b01100110, 0b0111, "uqsub16">; -def UQSUB8 : AAI<0b01100110, 0b1111, "uqsub8">; +def QADD : AAI<0b00010000, 0b00000101, "qadd", + [(set GPR:$Rd, (int_arm_qadd GPR:$Rm, GPR:$Rn))], + (ins GPR:$Rm, GPR:$Rn), "\t$Rd, $Rm, $Rn">; +def QSUB : AAI<0b00010010, 0b00000101, "qsub", + [(set GPR:$Rd, (int_arm_qsub GPR:$Rm, GPR:$Rn))], + (ins GPR:$Rm, GPR:$Rn), "\t$Rd, $Rm, $Rn">; +def QDADD : AAI<0b00010100, 0b00000101, "qdadd", [], (ins GPR:$Rm, GPR:$Rn), + "\t$Rd, $Rm, $Rn">; +def QDSUB : AAI<0b00010110, 0b00000101, "qdsub", [], (ins GPR:$Rm, GPR:$Rn), + "\t$Rd, $Rm, $Rn">; + +def QADD16 : AAI<0b01100010, 0b11110001, "qadd16">; +def QADD8 : AAI<0b01100010, 0b11111001, "qadd8">; +def QASX : AAI<0b01100010, 0b11110011, "qasx">; +def QSAX : AAI<0b01100010, 0b11110101, "qsax">; +def QSUB16 : AAI<0b01100010, 0b11110111, "qsub16">; +def QSUB8 : AAI<0b01100010, 0b11111111, "qsub8">; +def UQADD16 : AAI<0b01100110, 0b11110001, "uqadd16">; +def UQADD8 : AAI<0b01100110, 0b11111001, "uqadd8">; +def UQASX : AAI<0b01100110, 0b11110011, "uqasx">; +def UQSAX : AAI<0b01100110, 0b11110101, "uqsax">; +def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">; +def UQSUB8 : AAI<0b01100110, 0b11111111, "uqsub8">; // Signed/Unsigned add/subtract -- for disassembly only -def SASX : AAI<0b01100001, 0b0011, "sasx">; -def SADD16 : AAI<0b01100001, 0b0001, "sadd16">; -def SADD8 : AAI<0b01100001, 0b1001, "sadd8">; -def SSAX : AAI<0b01100001, 0b0101, "ssax">; -def SSUB16 : AAI<0b01100001, 0b0111, "ssub16">; -def SSUB8 : AAI<0b01100001, 0b1111, "ssub8">; -def UASX : AAI<0b01100101, 0b0011, "uasx">; -def UADD16 : AAI<0b01100101, 0b0001, "uadd16">; -def UADD8 : AAI<0b01100101, 0b1001, "uadd8">; -def USAX : AAI<0b01100101, 0b0101, "usax">; -def USUB16 : AAI<0b01100101, 0b0111, "usub16">; -def USUB8 : AAI<0b01100101, 0b1111, "usub8">; +def SASX : AAI<0b01100001, 0b11110011, "sasx">; +def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">; +def SADD8 : AAI<0b01100001, 0b11111001, "sadd8">; +def SSAX : AAI<0b01100001, 0b11110101, "ssax">; +def SSUB16 : AAI<0b01100001, 0b11110111, "ssub16">; +def SSUB8 : AAI<0b01100001, 0b11111111, "ssub8">; +def UASX : AAI<0b01100101, 0b11110011, "uasx">; +def UADD16 : AAI<0b01100101, 0b11110001, "uadd16">; +def UADD8 : AAI<0b01100101, 0b11111001, "uadd8">; +def USAX : AAI<0b01100101, 0b11110101, "usax">; +def USUB16 : AAI<0b01100101, 0b11110111, "usub16">; +def USUB8 : AAI<0b01100101, 0b11111111, "usub8">; // Signed/Unsigned halving add/subtract -- for disassembly only -def SHASX : AAI<0b01100011, 0b0011, "shasx">; -def SHADD16 : AAI<0b01100011, 0b0001, "shadd16">; -def SHADD8 : AAI<0b01100011, 0b1001, "shadd8">; -def SHSAX : AAI<0b01100011, 0b0101, "shsax">; -def SHSUB16 : AAI<0b01100011, 0b0111, "shsub16">; -def SHSUB8 : AAI<0b01100011, 0b1111, "shsub8">; -def UHASX : AAI<0b01100111, 0b0011, "uhasx">; -def UHADD16 : AAI<0b01100111, 0b0001, "uhadd16">; -def UHADD8 : AAI<0b01100111, 0b1001, "uhadd8">; -def UHSAX : AAI<0b01100111, 0b0101, "uhsax">; -def UHSUB16 : AAI<0b01100111, 0b0111, "uhsub16">; -def UHSUB8 : AAI<0b01100111, 0b1111, "uhsub8">; +def SHASX : AAI<0b01100011, 0b11110011, "shasx">; +def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">; +def SHADD8 : AAI<0b01100011, 0b11111001, "shadd8">; +def SHSAX : AAI<0b01100011, 0b11110101, "shsax">; +def SHSUB16 : AAI<0b01100011, 0b11110111, "shsub16">; +def SHSUB8 : AAI<0b01100011, 0b11111111, "shsub8">; +def UHASX : AAI<0b01100111, 0b11110011, "uhasx">; +def UHADD16 : AAI<0b01100111, 0b11110001, "uhadd16">; +def UHADD8 : AAI<0b01100111, 0b11111001, "uhadd8">; +def UHSAX : AAI<0b01100111, 0b11110101, "uhsax">; +def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">; +def UHSUB8 : AAI<0b01100111, 0b11111111, "uhsub8">; // Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only -def USAD8 : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), +def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), MulFrm /* for convenience */, NoItinerary, "usad8", - "\t$dst, $a, $b", []>, + "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; let Inst{27-20} = 0b01111000; let Inst{15-12} = 0b1111; let Inst{7-4} = 0b0001; + let Inst{19-16} = Rd; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; } -def USADA8 : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), +def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), MulFrm /* for convenience */, NoItinerary, "usada8", - "\t$dst, $a, $b, $acc", []>, + "\t$Rd, $Rn, $Rm, $Ra", []>, Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + bits<4> Ra; let Inst{27-20} = 0b01111000; let Inst{7-4} = 0b0001; + let Inst{19-16} = Rd; + let Inst{15-12} = Ra; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; } // Signed/Unsigned saturate -- for disassembly only -def SSAT : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, shift_imm:$sh), - SatFrm, NoItinerary, "ssat", "\t$dst, $bit_pos, $a$sh", +def SSAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh), + SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $a$sh", [/* For disassembly only; pattern left blank */]> { + bits<4> Rd; + bits<5> sat_imm; + bits<4> Rn; + bits<8> sh; let Inst{27-21} = 0b0110101; let Inst{5-4} = 0b01; + let Inst{20-16} = sat_imm; + let Inst{15-12} = Rd; + let Inst{11-7} = sh{7-3}; + let Inst{6} = sh{0}; + let Inst{3-0} = Rn; } -def SSAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), SatFrm, - NoItinerary, "ssat16", "\t$dst, $bit_pos, $a", +def SSAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$Rn), SatFrm, + NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", [/* For disassembly only; pattern left blank */]> { + bits<4> Rd; + bits<4> sat_imm; + bits<4> Rn; let Inst{27-20} = 0b01101010; - let Inst{7-4} = 0b0011; + let Inst{11-4} = 0b11110011; + let Inst{15-12} = Rd; + let Inst{19-16} = sat_imm; + let Inst{3-0} = Rn; } -def USAT : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, shift_imm:$sh), - SatFrm, NoItinerary, "usat", "\t$dst, $bit_pos, $a$sh", +def USAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh), + SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $a$sh", [/* For disassembly only; pattern left blank */]> { + bits<4> Rd; + bits<5> sat_imm; + bits<4> Rn; + bits<8> sh; let Inst{27-21} = 0b0110111; let Inst{5-4} = 0b01; + let Inst{15-12} = Rd; + let Inst{11-7} = sh{7-3}; + let Inst{6} = sh{0}; + let Inst{20-16} = sat_imm; + let Inst{3-0} = Rn; } -def USAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), SatFrm, - NoItinerary, "usat16", "\t$dst, $bit_pos, $a", +def USAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a), SatFrm, + NoItinerary, "usat16", "\t$Rd, $sat_imm, $a", [/* For disassembly only; pattern left blank */]> { + bits<4> Rd; + bits<4> sat_imm; + bits<4> Rn; let Inst{27-20} = 0b01101110; - let Inst{7-4} = 0b0011; + let Inst{11-4} = 0b11110011; + let Inst{15-12} = Rd; + let Inst{19-16} = sat_imm; + let Inst{3-0} = Rn; } def : ARMV6Pat<(int_arm_ssat GPR:$a, imm:$pos), (SSAT imm:$pos, GPR:$a, 0)>; @@ -1855,52 +2479,100 @@ def : ARMV6Pat<(int_arm_usat GPR:$a, imm:$pos), (USAT imm:$pos, GPR:$a, 0)>; // defm AND : AsI1_bin_irs<0b0000, "and", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, BinOpFrag<(and node:$LHS, node:$RHS)>, 1>; -defm ANDS : AI1_bin_s_irs<0b0000, "and", - BinOpFrag<(ARMand node:$LHS, node:$RHS)>, 1>; defm ORR : AsI1_bin_irs<0b1100, "orr", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, BinOpFrag<(or node:$LHS, node:$RHS)>, 1>; defm EOR : AsI1_bin_irs<0b0001, "eor", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>; defm BIC : AsI1_bin_irs<0b1110, "bic", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, BinOpFrag<(and node:$LHS, (not node:$RHS))>>; -def BFC : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), +def BFC : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm), AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi, - "bfc", "\t$dst, $imm", "$src = $dst", - [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>, + "bfc", "\t$Rd, $imm", "$src = $Rd", + [(set GPR:$Rd, (and GPR:$src, bf_inv_mask_imm:$imm))]>, Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<10> imm; let Inst{27-21} = 0b0111110; let Inst{6-0} = 0b0011111; + let Inst{15-12} = Rd; + let Inst{11-7} = imm{4-0}; // lsb + let Inst{20-16} = imm{9-5}; // width } // A8.6.18 BFI - Bitfield insert (Encoding A1) -def BFI : I<(outs GPR:$dst), (ins GPR:$src, GPR:$val, bf_inv_mask_imm:$imm), +def BFI : I<(outs GPR:$Rd), (ins GPR:$src, GPR:$Rn, bf_inv_mask_imm:$imm), AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi, - "bfi", "\t$dst, $val, $imm", "$src = $dst", - [(set GPR:$dst, (ARMbfi GPR:$src, GPR:$val, + "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd", + [(set GPR:$Rd, (ARMbfi GPR:$src, GPR:$Rn, bf_inv_mask_imm:$imm))]>, Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rn; + bits<10> imm; + let Inst{27-21} = 0b0111110; + let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15 + let Inst{15-12} = Rd; + let Inst{11-7} = imm{4-0}; // lsb + let Inst{20-16} = imm{9-5}; // width + let Inst{3-0} = Rn; +} + +// GNU as only supports this form of bfi (w/ 4 arguments) +let isAsmParserOnly = 1 in +def BFI4p : I<(outs GPR:$Rd), (ins GPR:$src, GPR:$Rn, + lsb_pos_imm:$lsb, width_imm:$width), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi, + "bfi", "\t$Rd, $Rn, $lsb, $width", "$src = $Rd", + []>, Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rn; + bits<5> lsb; + bits<5> width; let Inst{27-21} = 0b0111110; let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15 + let Inst{15-12} = Rd; + let Inst{11-7} = lsb; + let Inst{20-16} = width; // Custom encoder => lsb+width-1 + let Inst{3-0} = Rn; } -def MVNr : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, - "mvn", "\t$dst, $src", - [(set GPR:$dst, (not GPR:$src))]>, UnaryDP { +def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr, + "mvn", "\t$Rd, $Rm", + [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP { + bits<4> Rd; + bits<4> Rm; let Inst{25} = 0; + let Inst{19-16} = 0b0000; let Inst{11-4} = 0b00000000; -} -def MVNs : AsI1<0b1111, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm, - IIC_iMOVsr, "mvn", "\t$dst, $src", - [(set GPR:$dst, (not so_reg:$src))]>, UnaryDP { + let Inst{15-12} = Rd; + let Inst{3-0} = Rm; +} +def MVNs : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg:$shift), DPSoRegFrm, + IIC_iMVNsr, "mvn", "\t$Rd, $shift", + [(set GPR:$Rd, (not so_reg:$shift))]>, UnaryDP { + bits<4> Rd; + bits<12> shift; let Inst{25} = 0; -} -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm, - IIC_iMOVi, "mvn", "\t$dst, $imm", - [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP { - let Inst{25} = 1; + let Inst{19-16} = 0b0000; + let Inst{15-12} = Rd; + let Inst{11-0} = shift; +} +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, + IIC_iMVNi, "mvn", "\t$Rd, $imm", + [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP { + bits<4> Rd; + bits<12> imm; + let Inst{25} = 1; + let Inst{19-16} = 0b0000; + let Inst{15-12} = Rd; + let Inst{11-0} = imm; } def : ARMPat<(and GPR:$src, so_imm_not:$imm), @@ -1909,247 +2581,299 @@ def : ARMPat<(and GPR:$src, so_imm_not:$imm), //===----------------------------------------------------------------------===// // Multiply Instructions. // +class AsMul1I32<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = Rd; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} -let isCommutable = 1 in -def MUL : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMUL32, "mul", "\t$dst, $a, $b", - [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>; - -def MLA : AsMul1I<0b0000001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), - IIC_iMAC32, "mla", "\t$dst, $a, $b, $c", - [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>; - -def MLS : AMul1I<0b0000011, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), - IIC_iMAC32, "mls", "\t$dst, $a, $b, $c", - [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>, - Requires<[IsARM, HasV6T2]>; +let isCommutable = 1 in { +let Constraints = "@earlyclobber $Rd" in +def MULv5: ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, + pred:$p, cc_out:$s), + Size4Bytes, IIC_iMUL32, + [(set GPR:$Rd, (mul GPR:$Rn, GPR:$Rm))]>, + Requires<[IsARM, NoV6]>; + +def MUL : AsMul1I32<0b0000000, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (mul GPR:$Rn, GPR:$Rm))]>, + Requires<[IsARM, HasV6]>; +} + +let Constraints = "@earlyclobber $Rd" in +def MLAv5: ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s), + Size4Bytes, IIC_iMAC32, + [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, + Requires<[IsARM, NoV6]> { + bits<4> Ra; + let Inst{15-12} = Ra; +} +def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, + Requires<[IsARM, HasV6]> { + bits<4> Ra; + let Inst{15-12} = Ra; +} + +def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + bits<4> Ra; + let Inst{19-16} = Rd; + let Inst{15-12} = Ra; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} // Extra precision multiplies with low / high results + let neverHasSideEffects = 1 in { let isCommutable = 1 in { -def SMULL : AsMul1I<0b0000110, (outs GPR:$ldst, GPR:$hdst), - (ins GPR:$a, GPR:$b), IIC_iMUL64, - "smull", "\t$ldst, $hdst, $a, $b", []>; +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { +def SMULLv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + Size4Bytes, IIC_iMUL64, []>, + Requires<[IsARM, NoV6]>; -def UMULL : AsMul1I<0b0000100, (outs GPR:$ldst, GPR:$hdst), - (ins GPR:$a, GPR:$b), IIC_iMUL64, - "umull", "\t$ldst, $hdst, $a, $b", []>; +def UMULLv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + Size4Bytes, IIC_iMUL64, []>, + Requires<[IsARM, NoV6]>; } -// Multiply + accumulate -def SMLAL : AsMul1I<0b0000111, (outs GPR:$ldst, GPR:$hdst), - (ins GPR:$a, GPR:$b), IIC_iMAC64, - "smlal", "\t$ldst, $hdst, $a, $b", []>; +def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64, + "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]>; -def UMLAL : AsMul1I<0b0000101, (outs GPR:$ldst, GPR:$hdst), - (ins GPR:$a, GPR:$b), IIC_iMAC64, - "umlal", "\t$ldst, $hdst, $a, $b", []>; +def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64, + "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]>; +} -def UMAAL : AMul1I <0b0000010, (outs GPR:$ldst, GPR:$hdst), - (ins GPR:$a, GPR:$b), IIC_iMAC64, - "umaal", "\t$ldst, $hdst, $a, $b", []>, +// Multiply + accumulate +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { +def SMLALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + Size4Bytes, IIC_iMAC64, []>, + Requires<[IsARM, NoV6]>; +def UMLALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + Size4Bytes, IIC_iMAC64, []>, + Requires<[IsARM, NoV6]>; +def UMAALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + Size4Bytes, IIC_iMAC64, []>, + Requires<[IsARM, NoV6]>; + +} + +def SMLAL : AsMul1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]>; +def UMLAL : AsMul1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, Requires<[IsARM, HasV6]>; + +def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = RdLo; + let Inst{15-12} = RdHi; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} } // neverHasSideEffects // Most significant word multiply -def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMUL32, "smmul", "\t$dst, $a, $b", - [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>, +def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV6]> { - let Inst{7-4} = 0b0001; let Inst{15-12} = 0b1111; } -def SMMULR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMUL32, "smmulr", "\t$dst, $a, $b", +def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6]> { - let Inst{7-4} = 0b0011; // R = 1 let Inst{15-12} = 0b1111; } -def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), - IIC_iMAC32, "smmla", "\t$dst, $a, $b, $c", - [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>, - Requires<[IsARM, HasV6]> { - let Inst{7-4} = 0b0001; -} +def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, + Requires<[IsARM, HasV6]>; -def SMMLAR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), - IIC_iMAC32, "smmlar", "\t$dst, $a, $b, $c", +def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM, HasV6]> { - let Inst{7-4} = 0b0011; // R = 1 -} + Requires<[IsARM, HasV6]>; -def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), - IIC_iMAC32, "smmls", "\t$dst, $a, $b, $c", - [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>, - Requires<[IsARM, HasV6]> { - let Inst{7-4} = 0b1101; -} +def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (sub GPR:$Ra, (mulhs GPR:$Rn, GPR:$Rm)))]>, + Requires<[IsARM, HasV6]>; -def SMMLSR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), - IIC_iMAC32, "smmlsr", "\t$dst, $a, $b, $c", +def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM, HasV6]> { - let Inst{7-4} = 0b1111; // R = 1 -} + Requires<[IsARM, HasV6]>; multiclass AI_smul<string opc, PatFrag opnode> { - def BB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMUL32, !strconcat(opc, "bb"), "\t$dst, $a, $b", - [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), - (sext_inreg GPR:$b, i16)))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 0; - let Inst{6} = 0; - } - - def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMUL32, !strconcat(opc, "bt"), "\t$dst, $a, $b", - [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), - (sra GPR:$b, (i32 16))))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 0; - let Inst{6} = 1; - } - - def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMUL32, !strconcat(opc, "tb"), "\t$dst, $a, $b", - [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), - (sext_inreg GPR:$b, i16)))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 1; - let Inst{6} = 0; - } - - def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMUL32, !strconcat(opc, "tt"), "\t$dst, $a, $b", - [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), - (sra GPR:$b, (i32 16))))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 1; - let Inst{6} = 1; - } - - def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMUL16, !strconcat(opc, "wb"), "\t$dst, $a, $b", - [(set GPR:$dst, (sra (opnode GPR:$a, - (sext_inreg GPR:$b, i16)), (i32 16)))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 1; - let Inst{6} = 0; - } - - def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMUL16, !strconcat(opc, "wt"), "\t$dst, $a, $b", - [(set GPR:$dst, (sra (opnode GPR:$a, - (sra GPR:$b, (i32 16))), (i32 16)))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 1; - let Inst{6} = 1; - } + def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16), + (sext_inreg GPR:$Rm, i16)))]>, + Requires<[IsARM, HasV5TE]>; + + def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16), + (sra GPR:$Rm, (i32 16))))]>, + Requires<[IsARM, HasV5TE]>; + + def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)), + (sext_inreg GPR:$Rm, i16)))]>, + Requires<[IsARM, HasV5TE]>; + + def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)), + (sra GPR:$Rm, (i32 16))))]>, + Requires<[IsARM, HasV5TE]>; + + def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (sra (opnode GPR:$Rn, + (sext_inreg GPR:$Rm, i16)), (i32 16)))]>, + Requires<[IsARM, HasV5TE]>; + + def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (sra (opnode GPR:$Rn, + (sra GPR:$Rm, (i32 16))), (i32 16)))]>, + Requires<[IsARM, HasV5TE]>; } multiclass AI_smla<string opc, PatFrag opnode> { - def BB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - IIC_iMAC16, !strconcat(opc, "bb"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, - (opnode (sext_inreg GPR:$a, i16), - (sext_inreg GPR:$b, i16))))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 0; - let Inst{6} = 0; - } - - def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - IIC_iMAC16, !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16), - (sra GPR:$b, (i32 16)))))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 0; - let Inst{6} = 1; - } - - def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - IIC_iMAC16, !strconcat(opc, "tb"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), - (sext_inreg GPR:$b, i16))))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 1; - let Inst{6} = 0; - } - - def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - IIC_iMAC16, !strconcat(opc, "tt"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), - (sra GPR:$b, (i32 16)))))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 1; - let Inst{6} = 1; - } - - def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - IIC_iMAC16, !strconcat(opc, "wb"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, - (sext_inreg GPR:$b, i16)), (i32 16))))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 0; - let Inst{6} = 0; - } - - def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - IIC_iMAC16, !strconcat(opc, "wt"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, - (sra GPR:$b, (i32 16))), (i32 16))))]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 0; - let Inst{6} = 1; - } + def BB : AMulxyIa<0b0001000, 0b00, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add GPR:$Ra, + (opnode (sext_inreg GPR:$Rn, i16), + (sext_inreg GPR:$Rm, i16))))]>, + Requires<[IsARM, HasV5TE]>; + + def BT : AMulxyIa<0b0001000, 0b10, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add GPR:$Ra, (opnode (sext_inreg GPR:$Rn, i16), + (sra GPR:$Rm, (i32 16)))))]>, + Requires<[IsARM, HasV5TE]>; + + def TB : AMulxyIa<0b0001000, 0b01, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add GPR:$Ra, (opnode (sra GPR:$Rn, (i32 16)), + (sext_inreg GPR:$Rm, i16))))]>, + Requires<[IsARM, HasV5TE]>; + + def TT : AMulxyIa<0b0001000, 0b11, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add GPR:$Ra, (opnode (sra GPR:$Rn, (i32 16)), + (sra GPR:$Rm, (i32 16)))))]>, + Requires<[IsARM, HasV5TE]>; + + def WB : AMulxyIa<0b0001001, 0b00, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add GPR:$Ra, (sra (opnode GPR:$Rn, + (sext_inreg GPR:$Rm, i16)), (i32 16))))]>, + Requires<[IsARM, HasV5TE]>; + + def WT : AMulxyIa<0b0001001, 0b10, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add GPR:$Ra, (sra (opnode GPR:$Rn, + (sra GPR:$Rm, (i32 16))), (i32 16))))]>, + Requires<[IsARM, HasV5TE]>; } defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; // Halfword multiply accumulate long: SMLAL<x><y> -- for disassembly only -def SMLALBB : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), - IIC_iMAC64, "smlalbb", "\t$ldst, $hdst, $a, $b", +def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), + IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 0; - let Inst{6} = 0; -} + Requires<[IsARM, HasV5TE]>; -def SMLALBT : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), - IIC_iMAC64, "smlalbt", "\t$ldst, $hdst, $a, $b", +def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), + IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 0; - let Inst{6} = 1; -} + Requires<[IsARM, HasV5TE]>; -def SMLALTB : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), - IIC_iMAC64, "smlaltb", "\t$ldst, $hdst, $a, $b", +def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), + IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 1; - let Inst{6} = 0; -} + Requires<[IsARM, HasV5TE]>; -def SMLALTT : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), - IIC_iMAC64, "smlaltt", "\t$ldst, $hdst, $a, $b", +def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), + IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM, HasV5TE]> { - let Inst{5} = 1; - let Inst{6} = 1; -} + Requires<[IsARM, HasV5TE]>; // Helper class for AI_smld -- for disassembly only -class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops, - InstrItinClass itin, string opc, string asm> +class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> { + bits<4> Rn; + bits<4> Rm; let Inst{4} = 1; let Inst{5} = swap; let Inst{6} = sub; @@ -2157,21 +2881,46 @@ class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops, let Inst{21-20} = 0b00; let Inst{22} = long; let Inst{27-23} = 0b01110; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> { + bits<4> Rd; + let Inst{15-12} = 0b1111; + let Inst{19-16} = Rd; +} +class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> { + bits<4> Ra; + let Inst{15-12} = Ra; +} +class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> { + bits<4> RdLo; + bits<4> RdHi; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; } multiclass AI_smld<bit sub, string opc> { - def D : AMulDualI<0, sub, 0, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - NoItinerary, !strconcat(opc, "d"), "\t$dst, $a, $b, $acc">; + def D : AMulDualIa<0, sub, 0, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">; - def DX : AMulDualI<0, sub, 1, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - NoItinerary, !strconcat(opc, "dx"), "\t$dst, $a, $b, $acc">; + def DX: AMulDualIa<0, sub, 1, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">; - def LD : AMulDualI<1, sub, 0, (outs GPR:$ldst,GPR:$hdst), (ins GPR:$a,GPR:$b), - NoItinerary, !strconcat(opc, "ld"), "\t$ldst, $hdst, $a, $b">; + def LD: AMulDualI64<1, sub, 0, (outs GPR:$RdLo,GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), NoItinerary, + !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">; - def LDX : AMulDualI<1, sub, 1, (outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), - NoItinerary, !strconcat(opc, "ldx"),"\t$ldst, $hdst, $a, $b">; + def LDX : AMulDualI64<1, sub, 1, (outs GPR:$RdLo,GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), NoItinerary, + !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">; } @@ -2180,16 +2929,10 @@ defm SMLS : AI_smld<1, "smls">; multiclass AI_sdml<bit sub, string opc> { - def D : AMulDualI<0, sub, 0, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - NoItinerary, !strconcat(opc, "d"), "\t$dst, $a, $b"> { - let Inst{15-12} = 0b1111; - } - - def DX : AMulDualI<0, sub, 1, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - NoItinerary, !strconcat(opc, "dx"), "\t$dst, $a, $b"> { - let Inst{15-12} = 0b1111; - } - + def D : AMulDualI<0, sub, 0, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">; + def DX : AMulDualI<0, sub, 1, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">; } defm SMUA : AI_sdml<0, "smua">; @@ -2199,55 +2942,35 @@ defm SMUS : AI_sdml<1, "smus">; // Misc. Arithmetic Instructions. // -def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, - "clz", "\t$dst, $src", - [(set GPR:$dst, (ctlz GPR:$src))]>, Requires<[IsARM, HasV5T]> { - let Inst{7-4} = 0b0001; - let Inst{11-8} = 0b1111; - let Inst{19-16} = 0b1111; -} - -def RBIT : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, - "rbit", "\t$dst, $src", - [(set GPR:$dst, (ARMrbit GPR:$src))]>, - Requires<[IsARM, HasV6T2]> { - let Inst{7-4} = 0b0011; - let Inst{11-8} = 0b1111; - let Inst{19-16} = 0b1111; -} - -def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, - "rev", "\t$dst, $src", - [(set GPR:$dst, (bswap GPR:$src))]>, Requires<[IsARM, HasV6]> { - let Inst{7-4} = 0b0011; - let Inst{11-8} = 0b1111; - let Inst{19-16} = 0b1111; -} - -def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, - "rev16", "\t$dst, $src", - [(set GPR:$dst, - (or (and (srl GPR:$src, (i32 8)), 0xFF), - (or (and (shl GPR:$src, (i32 8)), 0xFF00), - (or (and (srl GPR:$src, (i32 8)), 0xFF0000), - (and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>, - Requires<[IsARM, HasV6]> { - let Inst{7-4} = 0b1011; - let Inst{11-8} = 0b1111; - let Inst{19-16} = 0b1111; -} - -def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, - "revsh", "\t$dst, $src", - [(set GPR:$dst, +def CLZ : AMiscA1I<0b000010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iUNAr, "clz", "\t$Rd, $Rm", + [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>; + +def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iUNAr, "rbit", "\t$Rd, $Rm", + [(set GPR:$Rd, (ARMrbit GPR:$Rm))]>, + Requires<[IsARM, HasV6T2]>; + +def REV : AMiscA1I<0b01101011, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iUNAr, "rev", "\t$Rd, $Rm", + [(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>; + +def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iUNAr, "rev16", "\t$Rd, $Rm", + [(set GPR:$Rd, + (or (and (srl GPR:$Rm, (i32 8)), 0xFF), + (or (and (shl GPR:$Rm, (i32 8)), 0xFF00), + (or (and (srl GPR:$Rm, (i32 8)), 0xFF0000), + (and (shl GPR:$Rm, (i32 8)), 0xFF000000)))))]>, + Requires<[IsARM, HasV6]>; + +def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iUNAr, "revsh", "\t$Rd, $Rm", + [(set GPR:$Rd, (sext_inreg - (or (srl (and GPR:$src, 0xFF00), (i32 8)), - (shl GPR:$src, (i32 8))), i16))]>, - Requires<[IsARM, HasV6]> { - let Inst{7-4} = 0b1011; - let Inst{11-8} = 0b1111; - let Inst{19-16} = 0b1111; -} + (or (srl (and GPR:$Rm, 0xFF00), (i32 8)), + (shl GPR:$Rm, (i32 8))), i16))]>, + Requires<[IsARM, HasV6]>; def lsl_shift_imm : SDNodeXForm<imm, [{ unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::lsl, N->getZExtValue()); @@ -2258,21 +2981,19 @@ def lsl_amt : PatLeaf<(i32 imm), [{ return (N->getZExtValue() < 32); }], lsl_shift_imm>; -def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst), - (ins GPR:$src1, GPR:$src2, shift_imm:$sh), - IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2$sh", - [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF), - (and (shl GPR:$src2, lsl_amt:$sh), - 0xFFFF0000)))]>, - Requires<[IsARM, HasV6]> { - let Inst{6-4} = 0b001; -} +def PKHBT : APKHI<0b01101000, 0, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, shift_imm:$sh), + IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh", + [(set GPR:$Rd, (or (and GPR:$Rn, 0xFFFF), + (and (shl GPR:$Rm, lsl_amt:$sh), + 0xFFFF0000)))]>, + Requires<[IsARM, HasV6]>; // Alternate cases for PKHBT where identities eliminate some nodes. -def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)), - (PKHBT GPR:$src1, GPR:$src2, 0)>; -def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$sh)), - (PKHBT GPR:$src1, GPR:$src2, (lsl_shift_imm imm16_31:$sh))>; +def : ARMV6Pat<(or (and GPR:$Rn, 0xFFFF), (and GPR:$Rm, 0xFFFF0000)), + (PKHBT GPR:$Rn, GPR:$Rm, 0)>; +def : ARMV6Pat<(or (and GPR:$Rn, 0xFFFF), (shl GPR:$Rm, imm16_31:$sh)), + (PKHBT GPR:$Rn, GPR:$Rm, (lsl_shift_imm imm16_31:$sh))>; def asr_shift_imm : SDNodeXForm<imm, [{ unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::asr, N->getZExtValue()); @@ -2285,15 +3006,13 @@ def asr_amt : PatLeaf<(i32 imm), [{ // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and // will match the pattern below. -def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst), - (ins GPR:$src1, GPR:$src2, shift_imm:$sh), - IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2$sh", - [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000), - (and (sra GPR:$src2, asr_amt:$sh), - 0xFFFF)))]>, - Requires<[IsARM, HasV6]> { - let Inst{6-4} = 0b101; -} +def PKHTB : APKHI<0b01101000, 1, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, shift_imm:$sh), + IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh", + [(set GPR:$Rd, (or (and GPR:$Rn, 0xFFFF0000), + (and (sra GPR:$Rm, asr_amt:$sh), + 0xFFFF)))]>, + Requires<[IsARM, HasV6]>; // Alternate cases for PKHTB where identities eliminate some nodes. Note that // a shift amount of 0 is *not legal* here, it is PKHBT instead. @@ -2308,10 +3027,19 @@ def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), // defm CMP : AI1_cmp_irs<0b1010, "cmp", + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; -// FIXME: There seems to be a (potential) hardware bug with the CMN instruction -// and comparison with 0. These two pieces of code should give identical +// ARMcmpZ can re-use the above instruction definitions. +def : ARMPat<(ARMcmpZ GPR:$src, so_imm:$imm), + (CMPri GPR:$src, so_imm:$imm)>; +def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs), + (CMPrr GPR:$src, GPR:$rhs)>; +def : ARMPat<(ARMcmpZ GPR:$src, so_reg:$rhs), + (CMPrs GPR:$src, so_reg:$rhs)>; + +// FIXME: We have to be careful when using the CMN instruction and comparison +// with 0. One would expect these two pieces of code should give identical // results: // // rsbs r1, r1, 0 @@ -2321,7 +3049,7 @@ defm CMP : AI1_cmp_irs<0b1010, "cmp", // mov r0, #1 // // and: -// +// // cmn r0, r1 // mov r0, #0 // it ls @@ -2336,20 +3064,16 @@ defm CMP : AI1_cmp_irs<0b1010, "cmp", // never a "carry" when this AddWithCarry is performed (because the "carry bit" // parameter to AddWithCarry is defined as 0). // -// The AddWithCarry in the CMP case seems to be relying upon the identity: -// -// ~x + 1 = -x -// -// However when x is 0 and unsigned, this doesn't hold: +// When x is 0 and unsigned: // // x = 0 // ~x = 0xFFFF FFFF // ~x + 1 = 0x1 0000 0000 // (-x = 0) != (0x1 0000 0000 = ~x + 1) // -// Therefore, we should disable *all* versions of CMN, especially when comparing -// against zero, until we can limit when the CMN instruction is used (when we -// know that the RHS is not 0) or when we have a hardware fix for this. +// Therefore, we should disable CMN when comparing against zero, until we can +// limit when the CMN instruction is used (when we know that the RHS is not 0 or +// when it's a comparison which doesn't look at the 'carry' flag). // // (See the ARM docs for the "AddWithCarry" pseudo-code.) // @@ -2360,13 +3084,14 @@ defm CMP : AI1_cmp_irs<0b1010, "cmp", // Note that TST/TEQ don't set all the same flags that CMP does! defm TST : AI1_cmp_irs<0b1000, "tst", - BinOpFrag<(ARMcmpZ (and node:$LHS, node:$RHS), 0)>, 1>; + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr, + BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1>; defm TEQ : AI1_cmp_irs<0b1001, "teq", - BinOpFrag<(ARMcmpZ (xor node:$LHS, node:$RHS), 0)>, 1>; + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr, + BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>; -defm CMPz : AI1_cmp_irs<0b1010, "cmp", - BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>; defm CMNz : AI1_cmp_irs<0b1011, "cmn", + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>; //def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm), @@ -2381,13 +3106,10 @@ let usesCustomInserter = 1, isBranch = 1, isTerminator = 1, def BCCi64 : PseudoInst<(outs), (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst), IIC_Br, - "${:comment} B\t$dst GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, imm:$cc", [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>; def BCCZi64 : PseudoInst<(outs), - (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), - IIC_Br, - "${:comment} B\t$dst GPR:$lhs1, GPR:$lhs2, 0, 0, imm:$cc", + (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br, [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>; } // usesCustomInserter @@ -2395,29 +3117,87 @@ def BCCZi64 : PseudoInst<(outs), // Conditional moves // FIXME: should be able to write a pattern for ARMcmov, but can't use // a two-value operand where a dag node expects two operands. :( +// FIXME: These should all be pseudo-instructions that get expanded to +// the normal MOV instructions. That would fix the dependency on +// special casing them in tblgen. let neverHasSideEffects = 1 in { -def MOVCCr : AI1<0b1101, (outs GPR:$dst), (ins GPR:$false, GPR:$true), DPFrm, - IIC_iCMOVr, "mov", "\t$dst, $true", - [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $dst">, UnaryDP { - let Inst{11-4} = 0b00000000; +def MOVCCr : AI1<0b1101, (outs GPR:$Rd), (ins GPR:$false, GPR:$Rm), DPFrm, + IIC_iCMOVr, "mov", "\t$Rd, $Rm", + [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">, UnaryDP { + bits<4> Rd; + bits<4> Rm; let Inst{25} = 0; + let Inst{20} = 0; + let Inst{15-12} = Rd; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rm; } -def MOVCCs : AI1<0b1101, (outs GPR:$dst), - (ins GPR:$false, so_reg:$true), DPSoRegFrm, IIC_iCMOVsr, - "mov", "\t$dst, $true", - [/*(set GPR:$dst, (ARMcmov GPR:$false, so_reg:$true, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $dst">, UnaryDP { +def MOVCCs : AI1<0b1101, (outs GPR:$Rd), + (ins GPR:$false, so_reg:$shift), DPSoRegFrm, IIC_iCMOVsr, + "mov", "\t$Rd, $shift", + [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg:$shift, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">, UnaryDP { + bits<4> Rd; + bits<12> shift; let Inst{25} = 0; + let Inst{20} = 0; + let Inst{19-16} = 0; + let Inst{15-12} = Rd; + let Inst{11-0} = shift; } -def MOVCCi : AI1<0b1101, (outs GPR:$dst), - (ins GPR:$false, so_imm:$true), DPFrm, IIC_iCMOVi, - "mov", "\t$dst, $true", - [/*(set GPR:$dst, (ARMcmov GPR:$false, so_imm:$true, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $dst">, UnaryDP { +let isMoveImm = 1 in +def MOVCCi16 : AI1<0b1000, (outs GPR:$Rd), (ins GPR:$false, i32imm_hilo16:$imm), + DPFrm, IIC_iMOVi, + "movw", "\t$Rd, $imm", + []>, + RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>, + UnaryDP { + bits<4> Rd; + bits<16> imm; + let Inst{25} = 1; + let Inst{20} = 0; + let Inst{19-16} = imm{15-12}; + let Inst{15-12} = Rd; + let Inst{11-0} = imm{11-0}; +} + +let isMoveImm = 1 in +def MOVCCi : AI1<0b1101, (outs GPR:$Rd), + (ins GPR:$false, so_imm:$imm), DPFrm, IIC_iCMOVi, + "mov", "\t$Rd, $imm", + [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">, UnaryDP { + bits<4> Rd; + bits<12> imm; let Inst{25} = 1; + let Inst{20} = 0; + let Inst{19-16} = 0b0000; + let Inst{15-12} = Rd; + let Inst{11-0} = imm; +} + +// Two instruction predicate mov immediate. +let isMoveImm = 1 in +def MOVCCi32imm : PseudoInst<(outs GPR:$Rd), + (ins GPR:$false, i32imm:$src, pred:$p), + IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">; + +let isMoveImm = 1 in +def MVNCCi : AI1<0b1111, (outs GPR:$Rd), + (ins GPR:$false, so_imm:$imm), DPFrm, IIC_iCMOVi, + "mvn", "\t$Rd, $imm", + [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">, UnaryDP { + bits<4> Rd; + bits<12> imm; + let Inst{25} = 1; + let Inst{20} = 0; + let Inst{19-16} = 0b0000; + let Inst{15-12} = Rd; + let Inst{11-0} = imm; } } // neverHasSideEffects @@ -2425,64 +3205,41 @@ def MOVCCi : AI1<0b1101, (outs GPR:$dst), // Atomic operations intrinsics // +def memb_opt : Operand<i32> { + let PrintMethod = "printMemBOption"; + let ParserMatchClass = MemBarrierOptOperand; +} + // memory barriers protect the atomic sequences let hasSideEffects = 1 in { -def DMBsy : AInoP<(outs), (ins), MiscFrm, NoItinerary, "dmb", "", - [(ARMMemBarrier)]>, Requires<[IsARM, HasDB]> { +def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, + "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>, + Requires<[IsARM, HasDB]> { + bits<4> opt; let Inst{31-4} = 0xf57ff05; - // FIXME: add support for options other than a full system DMB - // See DMB disassembly-only variants below. - let Inst{3-0} = 0b1111; -} - -def DSBsy : AInoP<(outs), (ins), MiscFrm, NoItinerary, "dsb", "", - [(ARMSyncBarrier)]>, Requires<[IsARM, HasDB]> { - let Inst{31-4} = 0xf57ff04; - // FIXME: add support for options other than a full system DSB - // See DSB disassembly-only variants below. - let Inst{3-0} = 0b1111; + let Inst{3-0} = opt; } def DMB_MCR : AInoP<(outs), (ins GPR:$zero), MiscFrm, NoItinerary, "mcr", "\tp15, 0, $zero, c7, c10, 5", [(ARMMemBarrierMCR GPR:$zero)]>, Requires<[IsARM, HasV6]> { - // FIXME: add support for options other than a full system DMB // FIXME: add encoding } - -def DSB_MCR : AInoP<(outs), (ins GPR:$zero), MiscFrm, NoItinerary, - "mcr", "\tp15, 0, $zero, c7, c10, 4", - [(ARMSyncBarrierMCR GPR:$zero)]>, - Requires<[IsARM, HasV6]> { - // FIXME: add support for options other than a full system DSB - // FIXME: add encoding -} -} - -// Memory Barrier Operations Variants -- for disassembly only - -def memb_opt : Operand<i32> { - let PrintMethod = "printMemBOption"; } -class AMBI<bits<4> op7_4, string opc> - : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, opc, "\t$opt", - [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM, HasDB]> { - let Inst{31-8} = 0xf57ff0; - let Inst{7-4} = op7_4; +def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, + "dsb", "\t$opt", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf57ff04; + let Inst{3-0} = opt; } -// These DMB variants are for disassembly only. -def DMBvar : AMBI<0b0101, "dmb">; - -// These DSB variants are for disassembly only. -def DSBvar : AMBI<0b0100, "dsb">; - // ISB has only full system option -- for disassembly only -def ISBsy : AInoP<(outs), (ins), MiscFrm, NoItinerary, "isb", "", []>, - Requires<[IsARM, HasDB]> { +def ISB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "isb", "", []>, + Requires<[IsARM, HasDB]> { let Inst{31-4} = 0xf57ff06; let Inst{3-0} = 0b1111; } @@ -2491,138 +3248,114 @@ let usesCustomInserter = 1 in { let Uses = [CPSR] in { def ATOMIC_LOAD_ADD_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_ADD_I8 PSEUDO!", [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_SUB_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_SUB_I8 PSEUDO!", [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_AND_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_AND_I8 PSEUDO!", [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_OR_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_OR_I8 PSEUDO!", [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_XOR_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_XOR_I8 PSEUDO!", [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_NAND_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_NAND_I8 PSEUDO!", [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_ADD_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_ADD_I16 PSEUDO!", [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_SUB_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_SUB_I16 PSEUDO!", [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_AND_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_AND_I16 PSEUDO!", [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_OR_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_OR_I16 PSEUDO!", [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_XOR_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_XOR_I16 PSEUDO!", [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_NAND_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_NAND_I16 PSEUDO!", [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_ADD_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_ADD_I32 PSEUDO!", [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_SUB_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_SUB_I32 PSEUDO!", [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_AND_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_AND_I32 PSEUDO!", [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_OR_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_OR_I32 PSEUDO!", [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_XOR_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_XOR_I32 PSEUDO!", [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>; def ATOMIC_LOAD_NAND_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - "${:comment} ATOMIC_LOAD_NAND_I32 PSEUDO!", [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>; def ATOMIC_SWAP_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, - "${:comment} ATOMIC_SWAP_I8 PSEUDO!", [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>; def ATOMIC_SWAP_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, - "${:comment} ATOMIC_SWAP_I16 PSEUDO!", [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>; def ATOMIC_SWAP_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, - "${:comment} ATOMIC_SWAP_I32 PSEUDO!", [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>; def ATOMIC_CMP_SWAP_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, - "${:comment} ATOMIC_CMP_SWAP_I8 PSEUDO!", [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>; def ATOMIC_CMP_SWAP_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, - "${:comment} ATOMIC_CMP_SWAP_I16 PSEUDO!", [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>; def ATOMIC_CMP_SWAP_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, - "${:comment} ATOMIC_CMP_SWAP_I32 PSEUDO!", [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>; } } let mayLoad = 1 in { -def LDREXB : AIldrex<0b10, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary, - "ldrexb", "\t$dest, [$ptr]", +def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary, + "ldrexb", "\t$Rt, [$Rn]", []>; -def LDREXH : AIldrex<0b11, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary, - "ldrexh", "\t$dest, [$ptr]", +def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary, + "ldrexh", "\t$Rt, [$Rn]", []>; -def LDREX : AIldrex<0b00, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary, - "ldrex", "\t$dest, [$ptr]", +def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary, + "ldrex", "\t$Rt, [$Rn]", []>; -def LDREXD : AIldrex<0b01, (outs GPR:$dest, GPR:$dest2), (ins GPR:$ptr), +def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins GPR:$Rn), NoItinerary, - "ldrexd", "\t$dest, $dest2, [$ptr]", + "ldrexd", "\t$Rt, $Rt2, [$Rn]", []>; } -let mayStore = 1, Constraints = "@earlyclobber $success" in { -def STREXB : AIstrex<0b10, (outs GPR:$success), (ins GPR:$src, GPR:$ptr), +let mayStore = 1, Constraints = "@earlyclobber $Rd" in { +def STREXB : AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$src, GPR:$Rn), NoItinerary, - "strexb", "\t$success, $src, [$ptr]", + "strexb", "\t$Rd, $src, [$Rn]", []>; -def STREXH : AIstrex<0b11, (outs GPR:$success), (ins GPR:$src, GPR:$ptr), +def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, GPR:$Rn), NoItinerary, - "strexh", "\t$success, $src, [$ptr]", + "strexh", "\t$Rd, $Rt, [$Rn]", []>; -def STREX : AIstrex<0b00, (outs GPR:$success), (ins GPR:$src, GPR:$ptr), +def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, GPR:$Rn), NoItinerary, - "strex", "\t$success, $src, [$ptr]", + "strex", "\t$Rd, $Rt, [$Rn]", []>; -def STREXD : AIstrex<0b01, (outs GPR:$success), - (ins GPR:$src, GPR:$src2, GPR:$ptr), +def STREXD : AIstrex<0b01, (outs GPR:$Rd), + (ins GPR:$Rt, GPR:$Rt2, GPR:$Rn), NoItinerary, - "strexd", "\t$success, $src, $src2, [$ptr]", + "strexd", "\t$Rd, $Rt, $Rt2, [$Rn]", []>; } @@ -2630,29 +3363,15 @@ def STREXD : AIstrex<0b01, (outs GPR:$success), def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV7]> { - let Inst{31-20} = 0xf57; - let Inst{7-4} = 0b0001; + let Inst{31-0} = 0b11110101011111111111000000011111; } // SWP/SWPB are deprecated in V6/V7 and for disassembly only. let mayLoad = 1 in { -def SWP : AI<(outs GPR:$dst), (ins GPR:$src, GPR:$ptr), LdStExFrm, NoItinerary, - "swp", "\t$dst, $src, [$ptr]", - [/* For disassembly only; pattern left blank */]> { - let Inst{27-23} = 0b00010; - let Inst{22} = 0; // B = 0 - let Inst{21-20} = 0b00; - let Inst{7-4} = 0b1001; -} - -def SWPB : AI<(outs GPR:$dst), (ins GPR:$src, GPR:$ptr), LdStExFrm, NoItinerary, - "swpb", "\t$dst, $src, [$ptr]", - [/* For disassembly only; pattern left blank */]> { - let Inst{27-23} = 0b00010; - let Inst{22} = 1; // B = 1 - let Inst{21-20} = 0b00; - let Inst{7-4} = 0b1001; -} +def SWP : AIswp<0, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swp", + [/* For disassembly only; pattern left blank */]>; +def SWPB : AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swpb", + [/* For disassembly only; pattern left blank */]>; } //===----------------------------------------------------------------------===// @@ -2660,10 +3379,11 @@ def SWPB : AI<(outs GPR:$dst), (ins GPR:$src, GPR:$ptr), LdStExFrm, NoItinerary, // // __aeabi_read_tp preserves the registers r1-r3. +// This is a pseudo inst so that we can get the encoding right, +// complete with fixup for the aeabi_read_tp function. let isCall = 1, - Defs = [R0, R12, LR, CPSR] in { - def TPsoft : ABXI<0b1011, (outs), (ins), IIC_Br, - "bl\t__aeabi_read_tp", + Defs = [R0, R12, LR, CPSR], Uses = [SP] in { + def TPsoft : PseudoInst<(outs), (ins), IIC_Br, [(set R0, ARMthread_pointer)]>; } @@ -2680,19 +3400,16 @@ let isCall = 1, // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved resgisters, which is exactly what we want. // A constant value is passed in $val, and we use the location as a scratch. +// +// These are pseudo-instructions and are lowered to individual MC-insts, so +// no encoding information is necessary. let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31 ], hasSideEffects = 1, isBarrier = 1 in { - def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src, GPR:$val), - AddrModeNone, SizeSpecial, IndexModeNone, - Pseudo, NoItinerary, - "add\t$val, pc, #8\t${:comment} eh_setjmp begin\n\t" - "str\t$val, [$src, #+4]\n\t" - "mov\tr0, #0\n\t" - "add\tpc, pc, #0\n\t" - "mov\tr0, #1 ${:comment} eh_setjmp end", "", + def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), + NoItinerary, [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, Requires<[IsARM, HasVFP2]>; } @@ -2700,14 +3417,8 @@ let Defs = let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], hasSideEffects = 1, isBarrier = 1 in { - def Int_eh_sjlj_setjmp_nofp : XI<(outs), (ins GPR:$src, GPR:$val), - AddrModeNone, SizeSpecial, IndexModeNone, - Pseudo, NoItinerary, - "add\t$val, pc, #8\n ${:comment} eh_setjmp begin\n\t" - "str\t$val, [$src, #+4]\n\t" - "mov\tr0, #0\n\t" - "add\tpc, pc, #0\n\t" - "mov\tr0, #1 ${:comment} eh_setjmp end", "", + def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), + NoItinerary, [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, Requires<[IsARM, NoVFP]>; } @@ -2715,53 +3426,58 @@ let Defs = // FIXME: Non-Darwin version(s) let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, Defs = [ R7, LR, SP ] in { -def Int_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch), - AddrModeNone, SizeSpecial, IndexModeNone, - Pseudo, NoItinerary, - "ldr\tsp, [$src, #8]\n\t" - "ldr\t$scratch, [$src, #4]\n\t" - "ldr\tr7, [$src]\n\t" - "bx\t$scratch", "", +def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), + NoItinerary, [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, Requires<[IsARM, IsDarwin]>; } +// eh.sjlj.dispatchsetup pseudo-instruction. +// This pseudo is used for ARM, Thumb1 and Thumb2. Any differences are +// handled when the pseudo is expanded (which happens before any passes +// that need the instruction size). +let isBarrier = 1, hasSideEffects = 1 in +def Int_eh_sjlj_dispatchsetup : + PseudoInst<(outs), (ins GPR:$src), NoItinerary, + [(ARMeh_sjlj_dispatchsetup GPR:$src)]>, + Requires<[IsDarwin]>; + //===----------------------------------------------------------------------===// // Non-Instruction Patterns // // Large immediate handling. -// Two piece so_imms. -let isReMaterializable = 1 in -def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src), - Pseudo, IIC_iMOVi, - "mov", "\t$dst, $src", - [(set GPR:$dst, so_imm2part:$src)]>, - Requires<[IsARM, NoV6T2]>; - -def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS), - (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)), - (so_imm2part_2 imm:$RHS))>; -def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS), - (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)), - (so_imm2part_2 imm:$RHS))>; -def : ARMPat<(add GPR:$LHS, so_imm2part:$RHS), - (ADDri (ADDri GPR:$LHS, (so_imm2part_1 imm:$RHS)), - (so_imm2part_2 imm:$RHS))>; -def : ARMPat<(add GPR:$LHS, so_neg_imm2part:$RHS), - (SUBri (SUBri GPR:$LHS, (so_neg_imm2part_1 imm:$RHS)), - (so_neg_imm2part_2 imm:$RHS))>; - -// 32-bit immediate using movw + movt. +// 32-bit immediate using two piece so_imms or movw + movt. // This is a single pseudo instruction, the benefit is that it can be remat'd // as a single unit instead of having to handle reg inputs. // FIXME: Remove this when we can do generalized remat. -let isReMaterializable = 1 in -def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi, - "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}", - [(set GPR:$dst, (i32 imm:$src))]>, - Requires<[IsARM, HasV6T2]>; +let isReMaterializable = 1, isMoveImm = 1 in +def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2, + [(set GPR:$dst, (arm_i32imm:$src))]>, + Requires<[IsARM]>; + +// Pseudo instruction that combines movw + movt + add pc (if PIC). +// It also makes it possible to rematerialize the instructions. +// FIXME: Remove this when we can do generalized remat and when machine licm +// can properly the instructions. +let isReMaterializable = 1 in { +def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2addpc, + [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>, + Requires<[IsARM, UseMovt]>; + +def MOV_ga_dyn : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2, + [(set GPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>, + Requires<[IsARM, UseMovt]>; + +let AddedComplexity = 10 in +def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2ld, + [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>, + Requires<[IsARM, UseMovt]>; +} // isReMaterializable // ConstantPool, GlobalAddress, and JumpTable def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>, @@ -2800,11 +3516,15 @@ def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>, Requires<[IsARM, IsDarwin]>; // zextload i1 -> zextload i8 -def : ARMPat<(zextloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>; +def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(zextloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; // extload -> zextload -def : ARMPat<(extloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>; -def : ARMPat<(extloadi8 addrmode2:$addr), (LDRB addrmode2:$addr)>; +def : ARMPat<(extloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(extloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; +def : ARMPat<(extloadi8 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(extloadi8 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; + def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>; def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>; @@ -2889,19 +3609,45 @@ include "ARMInstrNEON.td" // Coprocessor Instructions. For disassembly only. // -def CDP : ABI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, - nohash_imm:$CRd, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), - NoItinerary, "cdp", "\tp$cop, $opc1, cr$CRd, cr$CRn, cr$CRm, $opc2", - [/* For disassembly only; pattern left blank */]> { - let Inst{4} = 0; -} - -def CDP2 : ABXI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, - nohash_imm:$CRd, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), - NoItinerary, "cdp2\tp$cop, $opc1, cr$CRd, cr$CRn, cr$CRm, $opc2", +def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), + NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", + [/* For disassembly only; pattern left blank */]> { + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; +} + +def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), + NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; - let Inst{4} = 0; + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; } class ACI<dag oops, dag iops, string opc, string asm> @@ -3000,110 +3746,164 @@ defm LDC2 : LdStCop<0b1111, 1, "ldc2">; defm STC : LdStCop<{?,?,?,?}, 0, "stc">; defm STC2 : LdStCop<0b1111, 0, "stc2">; -def MCR : ABI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, - GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), - NoItinerary, "mcr", "\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2", - [/* For disassembly only; pattern left blank */]> { - let Inst{20} = 0; - let Inst{4} = 1; -} - -def MCR2 : ABXI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, - GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), - NoItinerary, "mcr2\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-28} = 0b1111; - let Inst{20} = 0; - let Inst{4} = 1; -} +//===----------------------------------------------------------------------===// +// Move between coprocessor and ARM core register -- for disassembly only +// -def MRC : ABI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, - GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), - NoItinerary, "mrc", "\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2", - [/* For disassembly only; pattern left blank */]> { - let Inst{20} = 1; +class MovRCopro<string opc, bit direction> + : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, + GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), + NoItinerary, opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", + [/* For disassembly only; pattern left blank */]> { + let Inst{20} = direction; let Inst{4} = 1; -} -def MRC2 : ABXI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, - GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), - NoItinerary, "mrc2\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2", - [/* For disassembly only; pattern left blank */]> { + bits<4> Rt; + bits<4> cop; + bits<3> opc1; + bits<3> opc2; + bits<4> CRm; + bits<4> CRn; + + let Inst{15-12} = Rt; + let Inst{11-8} = cop; + let Inst{23-21} = opc1; + let Inst{7-5} = opc2; + let Inst{3-0} = CRm; + let Inst{19-16} = CRn; +} + +def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */>; +def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */>; + +class MovRCopro2<string opc, bit direction> + : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, + GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), + NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), + [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; - let Inst{20} = 1; + let Inst{20} = direction; let Inst{4} = 1; -} -def MCRR : ABI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc, - GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm), - NoItinerary, "mcrr", "\tp$cop, $opc, $Rt, $Rt2, cr$CRm", - [/* For disassembly only; pattern left blank */]> { - let Inst{23-20} = 0b0100; -} - -def MCRR2 : ABXI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc, - GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm), - NoItinerary, "mcrr2\tp$cop, $opc, $Rt, $Rt2, cr$CRm", - [/* For disassembly only; pattern left blank */]> { + bits<4> Rt; + bits<4> cop; + bits<3> opc1; + bits<3> opc2; + bits<4> CRm; + bits<4> CRn; + + let Inst{15-12} = Rt; + let Inst{11-8} = cop; + let Inst{23-21} = opc1; + let Inst{7-5} = opc2; + let Inst{3-0} = CRm; + let Inst{19-16} = CRn; +} + +def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */>; +def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */>; + +class MovRRCopro<string opc, bit direction> + : ABI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1, + GPR:$Rt, GPR:$Rt2, c_imm:$CRm), + NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-21} = 0b010; + let Inst{20} = direction; + + bits<4> Rt; + bits<4> Rt2; + bits<4> cop; + bits<4> opc1; + bits<4> CRm; + + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + let Inst{11-8} = cop; + let Inst{7-4} = opc1; + let Inst{3-0} = CRm; +} + +def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */>; +def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>; + +class MovRRCopro2<string opc, bit direction> + : ABXI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1, + GPR:$Rt, GPR:$Rt2, c_imm:$CRm), + NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), + [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; - let Inst{23-20} = 0b0100; -} + let Inst{23-21} = 0b010; + let Inst{20} = direction; -def MRRC : ABI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc, - GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm), - NoItinerary, "mrrc", "\tp$cop, $opc, $Rt, $Rt2, cr$CRm", - [/* For disassembly only; pattern left blank */]> { - let Inst{23-20} = 0b0101; -} + bits<4> Rt; + bits<4> Rt2; + bits<4> cop; + bits<4> opc1; + bits<4> CRm; -def MRRC2 : ABXI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc, - GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm), - NoItinerary, "mrrc2\tp$cop, $opc, $Rt, $Rt2, cr$CRm", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-28} = 0b1111; - let Inst{23-20} = 0b0101; + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + let Inst{11-8} = cop; + let Inst{7-4} = opc1; + let Inst{3-0} = CRm; } +def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */>; +def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>; + //===----------------------------------------------------------------------===// // Move between special register and ARM core register -- for disassembly only // -def MRS : ABI<0b0001,(outs GPR:$dst),(ins), NoItinerary, "mrs", "\t$dst, cpsr", +// Move to ARM core register from Special Register +def MRS : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, cpsr", [/* For disassembly only; pattern left blank */]> { - let Inst{23-20} = 0b0000; + bits<4> Rd; + let Inst{23-16} = 0b00001111; + let Inst{15-12} = Rd; let Inst{7-4} = 0b0000; } -def MRSsys : ABI<0b0001,(outs GPR:$dst),(ins), NoItinerary,"mrs","\t$dst, spsr", +def MRSsys : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary,"mrs","\t$Rd, spsr", [/* For disassembly only; pattern left blank */]> { - let Inst{23-20} = 0b0100; + bits<4> Rd; + let Inst{23-16} = 0b01001111; + let Inst{15-12} = Rd; let Inst{7-4} = 0b0000; } -def MSR : ABI<0b0001, (outs), (ins GPR:$src, msr_mask:$mask), NoItinerary, - "msr", "\tcpsr$mask, $src", +// Move from ARM core register to Special Register +// +// No need to have both system and application versions, the encodings are the +// same and the assembly parser has no way to distinguish between them. The mask +// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains +// the mask with the fields to be accessed in the special register. +def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary, + "msr", "\t$mask, $Rn", [/* For disassembly only; pattern left blank */]> { - let Inst{23-20} = 0b0010; - let Inst{7-4} = 0b0000; -} + bits<5> mask; + bits<4> Rn; -def MSRi : ABI<0b0011, (outs), (ins so_imm:$a, msr_mask:$mask), NoItinerary, - "msr", "\tcpsr$mask, $a", - [/* For disassembly only; pattern left blank */]> { - let Inst{23-20} = 0b0010; - let Inst{7-4} = 0b0000; + let Inst{23} = 0; + let Inst{22} = mask{4}; // R bit + let Inst{21-20} = 0b10; + let Inst{19-16} = mask{3-0}; + let Inst{15-12} = 0b1111; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rn; } -def MSRsys : ABI<0b0001, (outs), (ins GPR:$src, msr_mask:$mask), NoItinerary, - "msr", "\tspsr$mask, $src", - [/* For disassembly only; pattern left blank */]> { - let Inst{23-20} = 0b0110; - let Inst{7-4} = 0b0000; -} +def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary, + "msr", "\t$mask, $a", + [/* For disassembly only; pattern left blank */]> { + bits<5> mask; + bits<12> a; -def MSRsysi : ABI<0b0011, (outs), (ins so_imm:$a, msr_mask:$mask), NoItinerary, - "msr", "\tspsr$mask, $a", - [/* For disassembly only; pattern left blank */]> { - let Inst{23-20} = 0b0110; - let Inst{7-4} = 0b0000; + let Inst{23} = 0; + let Inst{22} = mask{4}; // R bit + let Inst{21-20} = 0b10; + let Inst{19-16} = mask{3-0}; + let Inst{15-12} = 0b1111; + let Inst{11-0} = a; } diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td index 4d2f116..1e2e550 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -16,11 +16,17 @@ //===----------------------------------------------------------------------===// def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>; +def SDTARMVCMPZ : SDTypeProfile<1, 1, []>; def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>; +def NEONvceqz : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>; def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>; +def NEONvcgez : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>; +def NEONvclez : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>; def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>; def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>; +def NEONvcgtz : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>; +def NEONvcltz : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>; def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>; def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>; @@ -69,6 +75,11 @@ def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; def NEONvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>; def NEONvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; +def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def NEONvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>; +def NEONvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>; + def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; // VDUPLANE can produce a quad-register result from a double-register source, @@ -129,830 +140,1506 @@ def nModImm : Operand<i32> { // NEON load / store instructions //===----------------------------------------------------------------------===// -// Use vldmia to load a Q register as a D register pair. -// This is equivalent to VLDMD except that it has a Q register operand -// instead of a pair of D registers. -def VLDMQ - : AXDI4<(outs QPR:$dst), (ins addrmode4:$addr, pred:$p), - IndexModeNone, IIC_fpLoadm, - "vldm${addr:submode}${p}\t$addr, ${dst:dregpair}", "", - [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]>; - -let mayLoad = 1, neverHasSideEffects = 1 in { -// Use vld1 to load a Q register as a D register pair. -// This alternative to VLDMQ allows an alignment to be specified. -// This is equivalent to VLD1q64 except that it has a Q register operand. -def VLD1q - : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst), (ins addrmode6:$addr), - IIC_VLD1, "vld1", "64", "${dst:dregpair}, $addr", "", []>; -} // mayLoad = 1, neverHasSideEffects = 1 - -// Use vstmia to store a Q register as a D register pair. -// This is equivalent to VSTMD except that it has a Q register operand -// instead of a pair of D registers. -def VSTMQ - : AXDI4<(outs), (ins QPR:$src, addrmode4:$addr, pred:$p), - IndexModeNone, IIC_fpStorem, - "vstm${addr:submode}${p}\t$addr, ${src:dregpair}", "", - [(store (v2f64 QPR:$src), addrmode4:$addr)]>; - -let mayStore = 1, neverHasSideEffects = 1 in { -// Use vst1 to store a Q register as a D register pair. -// This alternative to VSTMQ allows an alignment to be specified. -// This is equivalent to VST1q64 except that it has a Q register operand. -def VST1q - : NLdSt<0,0b00,0b1010,0b1100, (outs), (ins addrmode6:$addr, QPR:$src), - IIC_VST, "vst1", "64", "${src:dregpair}, $addr", "", []>; -} // mayStore = 1, neverHasSideEffects = 1 - -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +// Use VLDM to load a Q register as a D register pair. +// This is a pseudo instruction that is expanded to VLDMD after reg alloc. +def VLDMQIA + : PseudoVFPLdStM<(outs QPR:$dst), (ins GPR:$Rn), + IIC_fpLoad_m, "", + [(set QPR:$dst, (v2f64 (load GPR:$Rn)))]>; +def VLDMQDB + : PseudoVFPLdStM<(outs QPR:$dst), (ins GPR:$Rn), + IIC_fpLoad_m, "", + [(set QPR:$dst, (v2f64 (load GPR:$Rn)))]>; + +// Use VSTM to store a Q register as a D register pair. +// This is a pseudo instruction that is expanded to VSTMD after reg alloc. +def VSTMQIA + : PseudoVFPLdStM<(outs), (ins QPR:$src, GPR:$Rn), + IIC_fpStore_m, "", + [(store (v2f64 QPR:$src), GPR:$Rn)]>; +def VSTMQDB + : PseudoVFPLdStM<(outs), (ins QPR:$src, GPR:$Rn), + IIC_fpStore_m, "", + [(store (v2f64 QPR:$src), GPR:$Rn)]>; // Classes for VLD* pseudo-instructions with multi-register operands. // These are expanded to real instructions after register allocation. -class VLDQPseudo - : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), IIC_VST, "">; -class VLDQWBPseudo +class VLDQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), itin, "">; +class VLDQWBPseudo<InstrItinClass itin> : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VST, + (ins addrmode6:$addr, am6offset:$offset), itin, "$addr.addr = $wb">; -class VLDQQPseudo - : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), IIC_VST, "">; -class VLDQQWBPseudo +class VLDQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), itin, "">; +class VLDQQWBPseudo<InstrItinClass itin> : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VST, + (ins addrmode6:$addr, am6offset:$offset), itin, "$addr.addr = $wb">; -class VLDQQQQWBPseudo +class VLDQQQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src), itin,"">; +class VLDQQQQWBPseudo<InstrItinClass itin> : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), IIC_VST, + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin, "$addr.addr = $wb, $src = $dst">; +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { + // VLD1 : Vector Load (multiple single elements) class VLD1D<bits<4> op7_4, string Dt> - : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "\\{$dst\\}, $addr", "", []>; + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$Vd), + (ins addrmode6:$Rn), IIC_VLD1, + "vld1", Dt, "\\{$Vd\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} class VLD1Q<bits<4> op7_4, string Dt> - : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$dst1, DPR:$dst2), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>; + : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6:$Rn), IIC_VLD1x2, + "vld1", Dt, "\\{$Vd, $dst2\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; +} -def VLD1d8 : VLD1D<0b0000, "8">; -def VLD1d16 : VLD1D<0b0100, "16">; -def VLD1d32 : VLD1D<0b1000, "32">; -def VLD1d64 : VLD1D<0b1100, "64">; +def VLD1d8 : VLD1D<{0,0,0,?}, "8">; +def VLD1d16 : VLD1D<{0,1,0,?}, "16">; +def VLD1d32 : VLD1D<{1,0,0,?}, "32">; +def VLD1d64 : VLD1D<{1,1,0,?}, "64">; -def VLD1q8 : VLD1Q<0b0000, "8">; -def VLD1q16 : VLD1Q<0b0100, "16">; -def VLD1q32 : VLD1Q<0b1000, "32">; -def VLD1q64 : VLD1Q<0b1100, "64">; +def VLD1q8 : VLD1Q<{0,0,?,?}, "8">; +def VLD1q16 : VLD1Q<{0,1,?,?}, "16">; +def VLD1q32 : VLD1Q<{1,0,?,?}, "32">; +def VLD1q64 : VLD1Q<{1,1,?,?}, "64">; -def VLD1q8Pseudo : VLDQPseudo; -def VLD1q16Pseudo : VLDQPseudo; -def VLD1q32Pseudo : VLDQPseudo; -def VLD1q64Pseudo : VLDQPseudo; +def VLD1q8Pseudo : VLDQPseudo<IIC_VLD1x2>; +def VLD1q16Pseudo : VLDQPseudo<IIC_VLD1x2>; +def VLD1q32Pseudo : VLDQPseudo<IIC_VLD1x2>; +def VLD1q64Pseudo : VLDQPseudo<IIC_VLD1x2>; // ...with address register writeback: class VLD1DWB<bits<4> op7_4, string Dt> - : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", Dt, "\\{$dst\\}, $addr$offset", - "$addr.addr = $wb", []>; + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$Vd, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1u, + "vld1", Dt, "\\{$Vd\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} class VLD1QWB<bits<4> op7_4, string Dt> - : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", Dt, "${dst:dregpair}, $addr$offset", - "$addr.addr = $wb", []>; - -def VLD1d8_UPD : VLD1DWB<0b0000, "8">; -def VLD1d16_UPD : VLD1DWB<0b0100, "16">; -def VLD1d32_UPD : VLD1DWB<0b1000, "32">; -def VLD1d64_UPD : VLD1DWB<0b1100, "64">; - -def VLD1q8_UPD : VLD1QWB<0b0000, "8">; -def VLD1q16_UPD : VLD1QWB<0b0100, "16">; -def VLD1q32_UPD : VLD1QWB<0b1000, "32">; -def VLD1q64_UPD : VLD1QWB<0b1100, "64">; - -def VLD1q8Pseudo_UPD : VLDQWBPseudo; -def VLD1q16Pseudo_UPD : VLDQWBPseudo; -def VLD1q32Pseudo_UPD : VLDQWBPseudo; -def VLD1q64Pseudo_UPD : VLDQWBPseudo; + : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x2u, + "vld1", Dt, "\\{$Vd, $dst2\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; +} + +def VLD1d8_UPD : VLD1DWB<{0,0,0,?}, "8">; +def VLD1d16_UPD : VLD1DWB<{0,1,0,?}, "16">; +def VLD1d32_UPD : VLD1DWB<{1,0,0,?}, "32">; +def VLD1d64_UPD : VLD1DWB<{1,1,0,?}, "64">; + +def VLD1q8_UPD : VLD1QWB<{0,0,?,?}, "8">; +def VLD1q16_UPD : VLD1QWB<{0,1,?,?}, "16">; +def VLD1q32_UPD : VLD1QWB<{1,0,?,?}, "32">; +def VLD1q64_UPD : VLD1QWB<{1,1,?,?}, "64">; + +def VLD1q8Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>; +def VLD1q16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>; +def VLD1q32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>; +def VLD1q64Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>; // ...with 3 registers (some of these are only for the disassembler): class VLD1D3<bits<4> op7_4, string Dt> - : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), - (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; + : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$Rn), IIC_VLD1x3, "vld1", Dt, + "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} class VLD1D3WB<bits<4> op7_4, string Dt> - : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", []>; + : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x3u, "vld1", Dt, + "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} -def VLD1d8T : VLD1D3<0b0000, "8">; -def VLD1d16T : VLD1D3<0b0100, "16">; -def VLD1d32T : VLD1D3<0b1000, "32">; -def VLD1d64T : VLD1D3<0b1100, "64">; +def VLD1d8T : VLD1D3<{0,0,0,?}, "8">; +def VLD1d16T : VLD1D3<{0,1,0,?}, "16">; +def VLD1d32T : VLD1D3<{1,0,0,?}, "32">; +def VLD1d64T : VLD1D3<{1,1,0,?}, "64">; -def VLD1d8T_UPD : VLD1D3WB<0b0000, "8">; -def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; -def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; -def VLD1d64T_UPD : VLD1D3WB<0b1100, "64">; +def VLD1d8T_UPD : VLD1D3WB<{0,0,0,?}, "8">; +def VLD1d16T_UPD : VLD1D3WB<{0,1,0,?}, "16">; +def VLD1d32T_UPD : VLD1D3WB<{1,0,0,?}, "32">; +def VLD1d64T_UPD : VLD1D3WB<{1,1,0,?}, "64">; -def VLD1d64TPseudo : VLDQQPseudo; -def VLD1d64TPseudo_UPD : VLDQQWBPseudo; +def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>; +def VLD1d64TPseudo_UPD : VLDQQWBPseudo<IIC_VLD1x3u>; // ...with 4 registers (some of these are only for the disassembler): class VLD1D4<bits<4> op7_4, string Dt> - : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>; + : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$Rn), IIC_VLD1x4, "vld1", Dt, + "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; +} class VLD1D4WB<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0010,op7_4, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", "$addr.addr = $wb", - []>; + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x4u, "vld1", Dt, + "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", "$Rn.addr = $wb", + []> { + let Inst{5-4} = Rn{5-4}; +} -def VLD1d8Q : VLD1D4<0b0000, "8">; -def VLD1d16Q : VLD1D4<0b0100, "16">; -def VLD1d32Q : VLD1D4<0b1000, "32">; -def VLD1d64Q : VLD1D4<0b1100, "64">; +def VLD1d8Q : VLD1D4<{0,0,?,?}, "8">; +def VLD1d16Q : VLD1D4<{0,1,?,?}, "16">; +def VLD1d32Q : VLD1D4<{1,0,?,?}, "32">; +def VLD1d64Q : VLD1D4<{1,1,?,?}, "64">; -def VLD1d8Q_UPD : VLD1D4WB<0b0000, "8">; -def VLD1d16Q_UPD : VLD1D4WB<0b0100, "16">; -def VLD1d32Q_UPD : VLD1D4WB<0b1000, "32">; -def VLD1d64Q_UPD : VLD1D4WB<0b1100, "64">; +def VLD1d8Q_UPD : VLD1D4WB<{0,0,?,?}, "8">; +def VLD1d16Q_UPD : VLD1D4WB<{0,1,?,?}, "16">; +def VLD1d32Q_UPD : VLD1D4WB<{1,0,?,?}, "32">; +def VLD1d64Q_UPD : VLD1D4WB<{1,1,?,?}, "64">; -def VLD1d64QPseudo : VLDQQPseudo; -def VLD1d64QPseudo_UPD : VLDQQWBPseudo; +def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>; +def VLD1d64QPseudo_UPD : VLDQQWBPseudo<IIC_VLD1x4u>; // VLD2 : Vector Load (multiple 2-element structures) class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2), - (ins addrmode6:$addr), IIC_VLD2, - "vld2", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>; + : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6:$Rn), IIC_VLD2, + "vld2", Dt, "\\{$Vd, $dst2\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; +} class VLD2Q<bits<4> op7_4, string Dt> : NLdSt<0, 0b10, 0b0011, op7_4, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD2, - "vld2", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>; + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$Rn), IIC_VLD2x2, + "vld2", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; +} -def VLD2d8 : VLD2D<0b1000, 0b0000, "8">; -def VLD2d16 : VLD2D<0b1000, 0b0100, "16">; -def VLD2d32 : VLD2D<0b1000, 0b1000, "32">; +def VLD2d8 : VLD2D<0b1000, {0,0,?,?}, "8">; +def VLD2d16 : VLD2D<0b1000, {0,1,?,?}, "16">; +def VLD2d32 : VLD2D<0b1000, {1,0,?,?}, "32">; -def VLD2q8 : VLD2Q<0b0000, "8">; -def VLD2q16 : VLD2Q<0b0100, "16">; -def VLD2q32 : VLD2Q<0b1000, "32">; +def VLD2q8 : VLD2Q<{0,0,?,?}, "8">; +def VLD2q16 : VLD2Q<{0,1,?,?}, "16">; +def VLD2q32 : VLD2Q<{1,0,?,?}, "32">; -def VLD2d8Pseudo : VLDQPseudo; -def VLD2d16Pseudo : VLDQPseudo; -def VLD2d32Pseudo : VLDQPseudo; +def VLD2d8Pseudo : VLDQPseudo<IIC_VLD2>; +def VLD2d16Pseudo : VLDQPseudo<IIC_VLD2>; +def VLD2d32Pseudo : VLDQPseudo<IIC_VLD2>; -def VLD2q8Pseudo : VLDQQPseudo; -def VLD2q16Pseudo : VLDQQPseudo; -def VLD2q32Pseudo : VLDQQPseudo; +def VLD2q8Pseudo : VLDQQPseudo<IIC_VLD2x2>; +def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>; +def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>; // ...with address register writeback: class VLD2DWB<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD2, - "vld2", Dt, "\\{$dst1, $dst2\\}, $addr$offset", - "$addr.addr = $wb", []>; + : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2u, + "vld2", Dt, "\\{$Vd, $dst2\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; +} class VLD2QWB<bits<4> op7_4, string Dt> : NLdSt<0, 0b10, 0b0011, op7_4, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD2, - "vld2", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", - "$addr.addr = $wb", []>; + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2x2u, + "vld2", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; +} -def VLD2d8_UPD : VLD2DWB<0b1000, 0b0000, "8">; -def VLD2d16_UPD : VLD2DWB<0b1000, 0b0100, "16">; -def VLD2d32_UPD : VLD2DWB<0b1000, 0b1000, "32">; +def VLD2d8_UPD : VLD2DWB<0b1000, {0,0,?,?}, "8">; +def VLD2d16_UPD : VLD2DWB<0b1000, {0,1,?,?}, "16">; +def VLD2d32_UPD : VLD2DWB<0b1000, {1,0,?,?}, "32">; -def VLD2q8_UPD : VLD2QWB<0b0000, "8">; -def VLD2q16_UPD : VLD2QWB<0b0100, "16">; -def VLD2q32_UPD : VLD2QWB<0b1000, "32">; +def VLD2q8_UPD : VLD2QWB<{0,0,?,?}, "8">; +def VLD2q16_UPD : VLD2QWB<{0,1,?,?}, "16">; +def VLD2q32_UPD : VLD2QWB<{1,0,?,?}, "32">; -def VLD2d8Pseudo_UPD : VLDQWBPseudo; -def VLD2d16Pseudo_UPD : VLDQWBPseudo; -def VLD2d32Pseudo_UPD : VLDQWBPseudo; +def VLD2d8Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>; +def VLD2d16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>; +def VLD2d32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>; -def VLD2q8Pseudo_UPD : VLDQQWBPseudo; -def VLD2q16Pseudo_UPD : VLDQQWBPseudo; -def VLD2q32Pseudo_UPD : VLDQQWBPseudo; +def VLD2q8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>; +def VLD2q16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>; +def VLD2q32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>; // ...with double-spaced registers (for disassembly only): -def VLD2b8 : VLD2D<0b1001, 0b0000, "8">; -def VLD2b16 : VLD2D<0b1001, 0b0100, "16">; -def VLD2b32 : VLD2D<0b1001, 0b1000, "32">; -def VLD2b8_UPD : VLD2DWB<0b1001, 0b0000, "8">; -def VLD2b16_UPD : VLD2DWB<0b1001, 0b0100, "16">; -def VLD2b32_UPD : VLD2DWB<0b1001, 0b1000, "32">; +def VLD2b8 : VLD2D<0b1001, {0,0,?,?}, "8">; +def VLD2b16 : VLD2D<0b1001, {0,1,?,?}, "16">; +def VLD2b32 : VLD2D<0b1001, {1,0,?,?}, "32">; +def VLD2b8_UPD : VLD2DWB<0b1001, {0,0,?,?}, "8">; +def VLD2b16_UPD : VLD2DWB<0b1001, {0,1,?,?}, "16">; +def VLD2b32_UPD : VLD2DWB<0b1001, {1,0,?,?}, "32">; // VLD3 : Vector Load (multiple 3-element structures) class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), - (ins addrmode6:$addr), IIC_VLD3, - "vld3", Dt, "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; + : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$Rn), IIC_VLD3, + "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} -def VLD3d8 : VLD3D<0b0100, 0b0000, "8">; -def VLD3d16 : VLD3D<0b0100, 0b0100, "16">; -def VLD3d32 : VLD3D<0b0100, 0b1000, "32">; +def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">; +def VLD3d16 : VLD3D<0b0100, {0,1,0,?}, "16">; +def VLD3d32 : VLD3D<0b0100, {1,0,0,?}, "32">; -def VLD3d8Pseudo : VLDQQPseudo; -def VLD3d16Pseudo : VLDQQPseudo; -def VLD3d32Pseudo : VLDQQPseudo; +def VLD3d8Pseudo : VLDQQPseudo<IIC_VLD3>; +def VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>; +def VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>; // ...with address register writeback: class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD3, - "vld3", Dt, "\\{$dst1, $dst2, $dst3\\}, $addr$offset", - "$addr.addr = $wb", []>; - -def VLD3d8_UPD : VLD3DWB<0b0100, 0b0000, "8">; -def VLD3d16_UPD : VLD3DWB<0b0100, 0b0100, "16">; -def VLD3d32_UPD : VLD3DWB<0b0100, 0b1000, "32">; - -def VLD3d8Pseudo_UPD : VLDQQWBPseudo; -def VLD3d16Pseudo_UPD : VLDQQWBPseudo; -def VLD3d32Pseudo_UPD : VLDQQWBPseudo; - -// ...with double-spaced registers (non-updating versions for disassembly only): -def VLD3q8 : VLD3D<0b0101, 0b0000, "8">; -def VLD3q16 : VLD3D<0b0101, 0b0100, "16">; -def VLD3q32 : VLD3D<0b0101, 0b1000, "32">; -def VLD3q8_UPD : VLD3DWB<0b0101, 0b0000, "8">; -def VLD3q16_UPD : VLD3DWB<0b0101, 0b0100, "16">; -def VLD3q32_UPD : VLD3DWB<0b0101, 0b1000, "32">; - -def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo; -def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo; -def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo; + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u, + "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} + +def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">; +def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">; +def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">; + +def VLD3d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; +def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; +def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; + +// ...with double-spaced registers: +def VLD3q8 : VLD3D<0b0101, {0,0,0,?}, "8">; +def VLD3q16 : VLD3D<0b0101, {0,1,0,?}, "16">; +def VLD3q32 : VLD3D<0b0101, {1,0,0,?}, "32">; +def VLD3q8_UPD : VLD3DWB<0b0101, {0,0,0,?}, "8">; +def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">; +def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">; + +def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; // ...alternate versions to be allocated odd register numbers: -def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD3q8oddPseudo : VLDQQQQPseudo<IIC_VLD3>; +def VLD3q16oddPseudo : VLDQQQQPseudo<IIC_VLD3>; +def VLD3q32oddPseudo : VLDQQQQPseudo<IIC_VLD3>; + +def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; // VLD4 : Vector Load (multiple 4-element structures) class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD4, - "vld4", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>; + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$Rn), IIC_VLD4, + "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; +} -def VLD4d8 : VLD4D<0b0000, 0b0000, "8">; -def VLD4d16 : VLD4D<0b0000, 0b0100, "16">; -def VLD4d32 : VLD4D<0b0000, 0b1000, "32">; +def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">; +def VLD4d16 : VLD4D<0b0000, {0,1,?,?}, "16">; +def VLD4d32 : VLD4D<0b0000, {1,0,?,?}, "32">; -def VLD4d8Pseudo : VLDQQPseudo; -def VLD4d16Pseudo : VLDQQPseudo; -def VLD4d32Pseudo : VLDQQPseudo; +def VLD4d8Pseudo : VLDQQPseudo<IIC_VLD4>; +def VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>; +def VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>; // ...with address register writeback: class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD4, - "vld4", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", - "$addr.addr = $wb", []>; - -def VLD4d8_UPD : VLD4DWB<0b0000, 0b0000, "8">; -def VLD4d16_UPD : VLD4DWB<0b0000, 0b0100, "16">; -def VLD4d32_UPD : VLD4DWB<0b0000, 0b1000, "32">; - -def VLD4d8Pseudo_UPD : VLDQQWBPseudo; -def VLD4d16Pseudo_UPD : VLDQQWBPseudo; -def VLD4d32Pseudo_UPD : VLDQQWBPseudo; - -// ...with double-spaced registers (non-updating versions for disassembly only): -def VLD4q8 : VLD4D<0b0001, 0b0000, "8">; -def VLD4q16 : VLD4D<0b0001, 0b0100, "16">; -def VLD4q32 : VLD4D<0b0001, 0b1000, "32">; -def VLD4q8_UPD : VLD4DWB<0b0001, 0b0000, "8">; -def VLD4q16_UPD : VLD4DWB<0b0001, 0b0100, "16">; -def VLD4q32_UPD : VLD4DWB<0b0001, 0b1000, "32">; - -def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo; -def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo; -def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo; + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u, + "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; +} + +def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">; +def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">; +def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">; + +def VLD4d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; +def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; +def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; + +// ...with double-spaced registers: +def VLD4q8 : VLD4D<0b0001, {0,0,?,?}, "8">; +def VLD4q16 : VLD4D<0b0001, {0,1,?,?}, "16">; +def VLD4q32 : VLD4D<0b0001, {1,0,?,?}, "32">; +def VLD4q8_UPD : VLD4DWB<0b0001, {0,0,?,?}, "8">; +def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">; +def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">; + +def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; // ...alternate versions to be allocated odd register numbers: -def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD4q8oddPseudo : VLDQQQQPseudo<IIC_VLD4>; +def VLD4q16oddPseudo : VLDQQQQPseudo<IIC_VLD4>; +def VLD4q32oddPseudo : VLDQQQQPseudo<IIC_VLD4>; + +def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; + +} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 + +// Classes for VLD*LN pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VLDQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst), + (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; +class VLDQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst), + (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; +class VLDQQQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst), + (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQQQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; // VLD1LN : Vector Load (single element to one lane) -// FIXME: Not yet implemented. +class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag LoadOp> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd), + (ins addrmode6:$Rn, DPR:$src, nohash_imm:$lane), + IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn", + "$src = $Vd", + [(set DPR:$Vd, (vector_insert (Ty DPR:$src), + (i32 (LoadOp addrmode6:$Rn)), + imm:$lane))]> { + let Rm = 0b1111; +} +class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> { + let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src), + (i32 (LoadOp addrmode6:$addr)), + imm:$lane))]; +} + +def VLD1LNd8 : VLD1LN<0b0000, {?,?,?,0}, "8", v8i8, extloadi8> { + let Inst{7-5} = lane{2-0}; +} +def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{4}; +} +def VLD1LNd32 : VLD1LN<0b1000, {?,0,?,?}, "32", v2i32, load> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{4}; + let Inst{4} = Rn{4}; +} + +def VLD1LNq8Pseudo : VLD1QLNPseudo<v16i8, extloadi8>; +def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>; +def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>; + +def : Pat<(vector_insert (v2f32 DPR:$src), + (f32 (load addrmode6:$addr)), imm:$lane), + (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>; +def : Pat<(vector_insert (v4f32 QPR:$src), + (f32 (load addrmode6:$addr)), imm:$lane), + (VLD1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { + +// ...with address register writeback: +class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt, + "\\{$Vd[$lane]\\}, $Rn$Rm", + "$src = $Vd, $Rn.addr = $wb", []>; + +def VLD1LNd8_UPD : VLD1LNWB<0b0000, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD1LNd16_UPD : VLD1LNWB<0b0100, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{4}; +} +def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{4}; + let Inst{4} = Rn{4}; +} + +def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; +def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; +def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; // VLD2LN : Vector Load (single 2-element structure to one lane) class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), - IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr", - "$src1 = $dst1, $src2 = $dst2", []>; + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane), + IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn", + "$src1 = $Vd, $src2 = $dst2", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} -def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8">; -def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16">; -def VLD2LNd32 : VLD2LN<0b1001, {?,0,?,?}, "32">; +def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD2LNd8Pseudo : VLDQLNPseudo<IIC_VLD2ln>; +def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>; +def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>; // ...with double-spaced registers: -def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16">; -def VLD2LNq32 : VLD2LN<0b1001, {?,1,?,?}, "32">; +def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; +} -// ...alternate versions to be allocated odd register numbers: -def VLD2LNq16odd : VLD2LN<0b0101, {?,?,1,?}, "16">; -def VLD2LNq32odd : VLD2LN<0b1001, {?,1,?,?}, "32">; +def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>; +def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>; // ...with address register writeback: class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt, - "\\{$dst1[$lane], $dst2[$lane]\\}, $addr$offset", - "$src1 = $dst1, $src2 = $dst2, $addr.addr = $wb", []>; + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2lnu, "vld2", Dt, + "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn$Rm", + "$src1 = $Vd, $src2 = $dst2, $Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} -def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8">; -def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16">; -def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,?,?}, "32">; +def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} -def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16">; -def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,?,?}, "32">; +def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; + +def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>; // VLD3LN : Vector Load (single 3-element structure to one lane) class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, - nohash_imm:$lane), IIC_VLD3, "vld3", Dt, - "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr", - "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>; + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt, + "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn", + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> { + let Rm = 0b1111; +} -def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8">; -def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16">; -def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32">; +def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD3LNd8Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; +def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; +def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; // ...with double-spaced registers: -def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16">; -def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32">; +def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} -// ...alternate versions to be allocated odd register numbers: -def VLD3LNq16odd : VLD3LN<0b0110, {?,?,1,0}, "16">; -def VLD3LNq32odd : VLD3LN<0b1010, {?,1,0,0}, "32">; +def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>; +def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>; // ...with address register writeback: class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b10, op11_8, op7_4, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, + : NLdStLn<1, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), - IIC_VLD3, "vld3", Dt, - "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr$offset", - "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $addr.addr = $wb", + IIC_VLD3lnu, "vld3", Dt, + "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm", + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb", []>; -def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8">; -def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16">; -def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32">; +def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} -def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16">; -def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32">; +def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; + +def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>; // VLD4LN : Vector Load (single 4-element structure to one lane) class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b10, op11_8, op7_4, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, - nohash_imm:$lane), IIC_VLD4, "vld4", Dt, - "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr", - "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>; + : NLdStLn<1, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt, + "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn", + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} -def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8">; -def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16">; -def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32">; +def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VLD4LNd8Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; +def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; +def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; // ...with double-spaced registers: -def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16">; -def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32">; +def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} -// ...alternate versions to be allocated odd register numbers: -def VLD4LNq16odd : VLD4LN<0b0111, {?,?,1,?}, "16">; -def VLD4LNq32odd : VLD4LN<0b1011, {?,1,?,?}, "32">; +def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>; +def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>; // ...with address register writeback: class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b10, op11_8, op7_4, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, + : NLdStLn<1, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), - IIC_VLD4, "vld4", Dt, -"\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr$offset", -"$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $addr.addr = $wb", - []>; + IIC_VLD4lnu, "vld4", Dt, +"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn$Rm", +"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $Rn.addr = $wb", + []> { + let Inst{4} = Rn{4}; +} -def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8">; -def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16">; -def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32">; +def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} -def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16">; -def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">; +def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; + +def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>; + +} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 // VLD1DUP : Vector Load (single element to all lanes) +class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd), (ins addrmode6dup:$Rn), + IIC_VLD1dup, "vld1", Dt, "\\{$Vd[]\\}, $Rn", "", + [(set DPR:$Vd, (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} +class VLD1QDUPPseudo<ValueType Ty, PatFrag LoadOp> : VLDQPseudo<IIC_VLD1dup> { + let Pattern = [(set QPR:$dst, + (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$addr)))))]; +} + +def VLD1DUPd8 : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8>; +def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16>; +def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load>; + +def VLD1DUPq8Pseudo : VLD1QDUPPseudo<v16i8, extloadi8>; +def VLD1DUPq16Pseudo : VLD1QDUPPseudo<v8i16, extloadi16>; +def VLD1DUPq32Pseudo : VLD1QDUPPseudo<v4i32, load>; + +def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), + (VLD1DUPd32 addrmode6:$addr)>; +def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), + (VLD1DUPq32Pseudo addrmode6:$addr)>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { + +class VLD1QDUP<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6dup:$Rn), IIC_VLD1dup, + "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} + +def VLD1DUPq8 : VLD1QDUP<{0,0,1,0}, "8">; +def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16">; +def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32">; + +// ...with address register writeback: +class VLD1DUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu, + "vld1", Dt, "\\{$Vd[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} +class VLD1QDUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu, + "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} + +def VLD1DUPd8_UPD : VLD1DUPWB<{0,0,0,0}, "8">; +def VLD1DUPd16_UPD : VLD1DUPWB<{0,1,0,?}, "16">; +def VLD1DUPd32_UPD : VLD1DUPWB<{1,0,0,?}, "32">; + +def VLD1DUPq8_UPD : VLD1QDUPWB<{0,0,1,0}, "8">; +def VLD1DUPq16_UPD : VLD1QDUPWB<{0,1,1,?}, "16">; +def VLD1DUPq32_UPD : VLD1QDUPWB<{1,0,1,?}, "32">; + +def VLD1DUPq8Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>; +def VLD1DUPq16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>; +def VLD1DUPq32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>; + // VLD2DUP : Vector Load (single 2-element structure to all lanes) +class VLD2DUP<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6dup:$Rn), IIC_VLD2dup, + "vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} + +def VLD2DUPd8 : VLD2DUP<{0,0,0,?}, "8">; +def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16">; +def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32">; + +def VLD2DUPd8Pseudo : VLDQPseudo<IIC_VLD2dup>; +def VLD2DUPd16Pseudo : VLDQPseudo<IIC_VLD2dup>; +def VLD2DUPd32Pseudo : VLDQPseudo<IIC_VLD2dup>; + +// ...with double-spaced registers (not used for codegen): +def VLD2DUPd8x2 : VLD2DUP<{0,0,1,?}, "8">; +def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16">; +def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32">; + +// ...with address register writeback: +class VLD2DUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD2dupu, + "vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} + +def VLD2DUPd8_UPD : VLD2DUPWB<{0,0,0,0}, "8">; +def VLD2DUPd16_UPD : VLD2DUPWB<{0,1,0,?}, "16">; +def VLD2DUPd32_UPD : VLD2DUPWB<{1,0,0,?}, "32">; + +def VLD2DUPd8x2_UPD : VLD2DUPWB<{0,0,1,0}, "8">; +def VLD2DUPd16x2_UPD : VLD2DUPWB<{0,1,1,?}, "16">; +def VLD2DUPd32x2_UPD : VLD2DUPWB<{1,0,1,?}, "32">; + +def VLD2DUPd8Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>; +def VLD2DUPd16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>; +def VLD2DUPd32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>; + // VLD3DUP : Vector Load (single 3-element structure to all lanes) +class VLD3DUP<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6dup:$Rn), IIC_VLD3dup, + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} + +def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">; +def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">; +def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">; + +def VLD3DUPd8Pseudo : VLDQQPseudo<IIC_VLD3dup>; +def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>; +def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>; + +// ...with double-spaced registers (not used for codegen): +def VLD3DUPd8x2 : VLD3DUP<{0,0,1,?}, "8">; +def VLD3DUPd16x2 : VLD3DUP<{0,1,1,?}, "16">; +def VLD3DUPd32x2 : VLD3DUP<{1,0,1,?}, "32">; + +// ...with address register writeback: +class VLD3DUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD3dupu, + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} + +def VLD3DUPd8_UPD : VLD3DUPWB<{0,0,0,0}, "8">; +def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16">; +def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32">; + +def VLD3DUPd8x2_UPD : VLD3DUPWB<{0,0,1,0}, "8">; +def VLD3DUPd16x2_UPD : VLD3DUPWB<{0,1,1,?}, "16">; +def VLD3DUPd32x2_UPD : VLD3DUPWB<{1,0,1,?}, "32">; + +def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; +def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; +def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; + // VLD4DUP : Vector Load (single 4-element structure to all lanes) -// FIXME: Not yet implemented. +class VLD4DUP<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1111, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6dup:$Rn), IIC_VLD4dup, + "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} + +def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8">; +def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">; +def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; } + +def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>; +def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>; +def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>; + +// ...with double-spaced registers (not used for codegen): +def VLD4DUPd8x2 : VLD4DUP<{0,0,1,?}, "8">; +def VLD4DUPd16x2 : VLD4DUP<{0,1,1,?}, "16">; +def VLD4DUPd32x2 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; } + +// ...with address register writeback: +class VLD4DUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1111, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu, + "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} + +def VLD4DUPd8_UPD : VLD4DUPWB<{0,0,0,0}, "8">; +def VLD4DUPd16_UPD : VLD4DUPWB<{0,1,0,?}, "16">; +def VLD4DUPd32_UPD : VLD4DUPWB<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; } + +def VLD4DUPd8x2_UPD : VLD4DUPWB<{0,0,1,0}, "8">; +def VLD4DUPd16x2_UPD : VLD4DUPWB<{0,1,1,?}, "16">; +def VLD4DUPd32x2_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; } + +def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; +def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; +def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; + } // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { // Classes for VST* pseudo-instructions with multi-register operands. // These are expanded to real instructions after register allocation. -class VSTQPseudo - : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src), IIC_VST, "">; -class VSTQWBPseudo +class VSTQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src), itin, "">; +class VSTQWBPseudo<InstrItinClass itin> : PseudoNLdSt<(outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, QPR:$src), IIC_VST, + (ins addrmode6:$addr, am6offset:$offset, QPR:$src), itin, "$addr.addr = $wb">; -class VSTQQPseudo - : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), IIC_VST, "">; -class VSTQQWBPseudo +class VSTQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), itin, "">; +class VSTQQWBPseudo<InstrItinClass itin> : PseudoNLdSt<(outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), IIC_VST, + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), itin, "$addr.addr = $wb">; -class VSTQQQQWBPseudo +class VSTQQQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src), itin, "">; +class VSTQQQQWBPseudo<InstrItinClass itin> : PseudoNLdSt<(outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), IIC_VST, + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin, "$addr.addr = $wb">; // VST1 : Vector Store (multiple single elements) class VST1D<bits<4> op7_4, string Dt> - : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST, - "vst1", Dt, "\\{$src\\}, $addr", "", []>; + : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd), + IIC_VST1, "vst1", Dt, "\\{$Vd\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} class VST1Q<bits<4> op7_4, string Dt> : NLdSt<0,0b00,0b1010,op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, - "vst1", Dt, "\\{$src1, $src2\\}, $addr", "", []>; + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2), IIC_VST1x2, + "vst1", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; +} -def VST1d8 : VST1D<0b0000, "8">; -def VST1d16 : VST1D<0b0100, "16">; -def VST1d32 : VST1D<0b1000, "32">; -def VST1d64 : VST1D<0b1100, "64">; +def VST1d8 : VST1D<{0,0,0,?}, "8">; +def VST1d16 : VST1D<{0,1,0,?}, "16">; +def VST1d32 : VST1D<{1,0,0,?}, "32">; +def VST1d64 : VST1D<{1,1,0,?}, "64">; -def VST1q8 : VST1Q<0b0000, "8">; -def VST1q16 : VST1Q<0b0100, "16">; -def VST1q32 : VST1Q<0b1000, "32">; -def VST1q64 : VST1Q<0b1100, "64">; +def VST1q8 : VST1Q<{0,0,?,?}, "8">; +def VST1q16 : VST1Q<{0,1,?,?}, "16">; +def VST1q32 : VST1Q<{1,0,?,?}, "32">; +def VST1q64 : VST1Q<{1,1,?,?}, "64">; -def VST1q8Pseudo : VSTQPseudo; -def VST1q16Pseudo : VSTQPseudo; -def VST1q32Pseudo : VSTQPseudo; -def VST1q64Pseudo : VSTQPseudo; +def VST1q8Pseudo : VSTQPseudo<IIC_VST1x2>; +def VST1q16Pseudo : VSTQPseudo<IIC_VST1x2>; +def VST1q32Pseudo : VSTQPseudo<IIC_VST1x2>; +def VST1q64Pseudo : VSTQPseudo<IIC_VST1x2>; // ...with address register writeback: class VST1DWB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0111, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, DPR:$src), IIC_VST, - "vst1", Dt, "\\{$src\\}, $addr$offset", "$addr.addr = $wb", []>; + (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd), IIC_VST1u, + "vst1", Dt, "\\{$Vd\\}, $Rn$Rm", "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} class VST1QWB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b1010, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, QPR:$src), IIC_VST, - "vst1", Dt, "${src:dregpair}, $addr$offset", "$addr.addr = $wb", []>; + (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2), + IIC_VST1x2u, "vst1", Dt, "\\{$Vd, $src2\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; +} -def VST1d8_UPD : VST1DWB<0b0000, "8">; -def VST1d16_UPD : VST1DWB<0b0100, "16">; -def VST1d32_UPD : VST1DWB<0b1000, "32">; -def VST1d64_UPD : VST1DWB<0b1100, "64">; +def VST1d8_UPD : VST1DWB<{0,0,0,?}, "8">; +def VST1d16_UPD : VST1DWB<{0,1,0,?}, "16">; +def VST1d32_UPD : VST1DWB<{1,0,0,?}, "32">; +def VST1d64_UPD : VST1DWB<{1,1,0,?}, "64">; -def VST1q8_UPD : VST1QWB<0b0000, "8">; -def VST1q16_UPD : VST1QWB<0b0100, "16">; -def VST1q32_UPD : VST1QWB<0b1000, "32">; -def VST1q64_UPD : VST1QWB<0b1100, "64">; +def VST1q8_UPD : VST1QWB<{0,0,?,?}, "8">; +def VST1q16_UPD : VST1QWB<{0,1,?,?}, "16">; +def VST1q32_UPD : VST1QWB<{1,0,?,?}, "32">; +def VST1q64_UPD : VST1QWB<{1,1,?,?}, "64">; -def VST1q8Pseudo_UPD : VSTQWBPseudo; -def VST1q16Pseudo_UPD : VSTQWBPseudo; -def VST1q32Pseudo_UPD : VSTQWBPseudo; -def VST1q64Pseudo_UPD : VSTQWBPseudo; +def VST1q8Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>; +def VST1q16Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>; +def VST1q32Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>; +def VST1q64Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>; // ...with 3 registers (some of these are only for the disassembler): class VST1D3<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>; + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), + IIC_VST1x3, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} class VST1D3WB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr$offset", - "$addr.addr = $wb", []>; + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3), + IIC_VST1x3u, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} -def VST1d8T : VST1D3<0b0000, "8">; -def VST1d16T : VST1D3<0b0100, "16">; -def VST1d32T : VST1D3<0b1000, "32">; -def VST1d64T : VST1D3<0b1100, "64">; +def VST1d8T : VST1D3<{0,0,0,?}, "8">; +def VST1d16T : VST1D3<{0,1,0,?}, "16">; +def VST1d32T : VST1D3<{1,0,0,?}, "32">; +def VST1d64T : VST1D3<{1,1,0,?}, "64">; -def VST1d8T_UPD : VST1D3WB<0b0000, "8">; -def VST1d16T_UPD : VST1D3WB<0b0100, "16">; -def VST1d32T_UPD : VST1D3WB<0b1000, "32">; -def VST1d64T_UPD : VST1D3WB<0b1100, "64">; +def VST1d8T_UPD : VST1D3WB<{0,0,0,?}, "8">; +def VST1d16T_UPD : VST1D3WB<{0,1,0,?}, "16">; +def VST1d32T_UPD : VST1D3WB<{1,0,0,?}, "32">; +def VST1d64T_UPD : VST1D3WB<{1,1,0,?}, "64">; -def VST1d64TPseudo : VSTQQPseudo; -def VST1d64TPseudo_UPD : VSTQQWBPseudo; +def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>; +def VST1d64TPseudo_UPD : VSTQQWBPseudo<IIC_VST1x3u>; // ...with 4 registers (some of these are only for the disassembler): class VST1D4<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0010, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", - []>; + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST1x4, "vst1", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", "", + []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; +} class VST1D4WB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", []>; + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST1x4u, + "vst1", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; +} -def VST1d8Q : VST1D4<0b0000, "8">; -def VST1d16Q : VST1D4<0b0100, "16">; -def VST1d32Q : VST1D4<0b1000, "32">; -def VST1d64Q : VST1D4<0b1100, "64">; +def VST1d8Q : VST1D4<{0,0,?,?}, "8">; +def VST1d16Q : VST1D4<{0,1,?,?}, "16">; +def VST1d32Q : VST1D4<{1,0,?,?}, "32">; +def VST1d64Q : VST1D4<{1,1,?,?}, "64">; -def VST1d8Q_UPD : VST1D4WB<0b0000, "8">; -def VST1d16Q_UPD : VST1D4WB<0b0100, "16">; -def VST1d32Q_UPD : VST1D4WB<0b1000, "32">; -def VST1d64Q_UPD : VST1D4WB<0b1100, "64">; +def VST1d8Q_UPD : VST1D4WB<{0,0,?,?}, "8">; +def VST1d16Q_UPD : VST1D4WB<{0,1,?,?}, "16">; +def VST1d32Q_UPD : VST1D4WB<{1,0,?,?}, "32">; +def VST1d64Q_UPD : VST1D4WB<{1,1,?,?}, "64">; -def VST1d64QPseudo : VSTQQPseudo; -def VST1d64QPseudo_UPD : VSTQQWBPseudo; +def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>; +def VST1d64QPseudo_UPD : VSTQQWBPseudo<IIC_VST1x4u>; // VST2 : Vector Store (multiple 2-element structures) class VST2D<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2), - IIC_VST, "vst2", Dt, "\\{$src1, $src2\\}, $addr", "", []>; + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2), + IIC_VST2, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; +} class VST2Q<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0011, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, "vst2", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", - "", []>; + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST2x2, "vst2", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; +} -def VST2d8 : VST2D<0b1000, 0b0000, "8">; -def VST2d16 : VST2D<0b1000, 0b0100, "16">; -def VST2d32 : VST2D<0b1000, 0b1000, "32">; +def VST2d8 : VST2D<0b1000, {0,0,?,?}, "8">; +def VST2d16 : VST2D<0b1000, {0,1,?,?}, "16">; +def VST2d32 : VST2D<0b1000, {1,0,?,?}, "32">; -def VST2q8 : VST2Q<0b0000, "8">; -def VST2q16 : VST2Q<0b0100, "16">; -def VST2q32 : VST2Q<0b1000, "32">; +def VST2q8 : VST2Q<{0,0,?,?}, "8">; +def VST2q16 : VST2Q<{0,1,?,?}, "16">; +def VST2q32 : VST2Q<{1,0,?,?}, "32">; -def VST2d8Pseudo : VSTQPseudo; -def VST2d16Pseudo : VSTQPseudo; -def VST2d32Pseudo : VSTQPseudo; +def VST2d8Pseudo : VSTQPseudo<IIC_VST2>; +def VST2d16Pseudo : VSTQPseudo<IIC_VST2>; +def VST2d32Pseudo : VSTQPseudo<IIC_VST2>; -def VST2q8Pseudo : VSTQQPseudo; -def VST2q16Pseudo : VSTQQPseudo; -def VST2q32Pseudo : VSTQQPseudo; +def VST2q8Pseudo : VSTQQPseudo<IIC_VST2x2>; +def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>; +def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>; // ...with address register writeback: class VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2), - IIC_VST, "vst2", Dt, "\\{$src1, $src2\\}, $addr$offset", - "$addr.addr = $wb", []>; + (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2), + IIC_VST2u, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; +} class VST2QWB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, "vst2", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", []>; + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST2x2u, + "vst2", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; +} -def VST2d8_UPD : VST2DWB<0b1000, 0b0000, "8">; -def VST2d16_UPD : VST2DWB<0b1000, 0b0100, "16">; -def VST2d32_UPD : VST2DWB<0b1000, 0b1000, "32">; +def VST2d8_UPD : VST2DWB<0b1000, {0,0,?,?}, "8">; +def VST2d16_UPD : VST2DWB<0b1000, {0,1,?,?}, "16">; +def VST2d32_UPD : VST2DWB<0b1000, {1,0,?,?}, "32">; -def VST2q8_UPD : VST2QWB<0b0000, "8">; -def VST2q16_UPD : VST2QWB<0b0100, "16">; -def VST2q32_UPD : VST2QWB<0b1000, "32">; +def VST2q8_UPD : VST2QWB<{0,0,?,?}, "8">; +def VST2q16_UPD : VST2QWB<{0,1,?,?}, "16">; +def VST2q32_UPD : VST2QWB<{1,0,?,?}, "32">; -def VST2d8Pseudo_UPD : VSTQWBPseudo; -def VST2d16Pseudo_UPD : VSTQWBPseudo; -def VST2d32Pseudo_UPD : VSTQWBPseudo; +def VST2d8Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>; +def VST2d16Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>; +def VST2d32Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>; -def VST2q8Pseudo_UPD : VSTQQWBPseudo; -def VST2q16Pseudo_UPD : VSTQQWBPseudo; -def VST2q32Pseudo_UPD : VSTQQWBPseudo; +def VST2q8Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>; +def VST2q16Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>; +def VST2q32Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>; // ...with double-spaced registers (for disassembly only): -def VST2b8 : VST2D<0b1001, 0b0000, "8">; -def VST2b16 : VST2D<0b1001, 0b0100, "16">; -def VST2b32 : VST2D<0b1001, 0b1000, "32">; -def VST2b8_UPD : VST2DWB<0b1001, 0b0000, "8">; -def VST2b16_UPD : VST2DWB<0b1001, 0b0100, "16">; -def VST2b32_UPD : VST2DWB<0b1001, 0b1000, "32">; +def VST2b8 : VST2D<0b1001, {0,0,?,?}, "8">; +def VST2b16 : VST2D<0b1001, {0,1,?,?}, "16">; +def VST2b32 : VST2D<0b1001, {1,0,?,?}, "32">; +def VST2b8_UPD : VST2DWB<0b1001, {0,0,?,?}, "8">; +def VST2b16_UPD : VST2DWB<0b1001, {0,1,?,?}, "16">; +def VST2b32_UPD : VST2DWB<0b1001, {1,0,?,?}, "32">; // VST3 : Vector Store (multiple 3-element structures) class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, - "vst3", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>; + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3, + "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} -def VST3d8 : VST3D<0b0100, 0b0000, "8">; -def VST3d16 : VST3D<0b0100, 0b0100, "16">; -def VST3d32 : VST3D<0b0100, 0b1000, "32">; +def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">; +def VST3d16 : VST3D<0b0100, {0,1,0,?}, "16">; +def VST3d32 : VST3D<0b0100, {1,0,0,?}, "32">; -def VST3d8Pseudo : VSTQQPseudo; -def VST3d16Pseudo : VSTQQPseudo; -def VST3d32Pseudo : VSTQQPseudo; +def VST3d8Pseudo : VSTQQPseudo<IIC_VST3>; +def VST3d16Pseudo : VSTQQPseudo<IIC_VST3>; +def VST3d32Pseudo : VSTQQPseudo<IIC_VST3>; // ...with address register writeback: class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, - "vst3", Dt, "\\{$src1, $src2, $src3\\}, $addr$offset", - "$addr.addr = $wb", []>; - -def VST3d8_UPD : VST3DWB<0b0100, 0b0000, "8">; -def VST3d16_UPD : VST3DWB<0b0100, 0b0100, "16">; -def VST3d32_UPD : VST3DWB<0b0100, 0b1000, "32">; - -def VST3d8Pseudo_UPD : VSTQQWBPseudo; -def VST3d16Pseudo_UPD : VSTQQWBPseudo; -def VST3d32Pseudo_UPD : VSTQQWBPseudo; - -// ...with double-spaced registers (non-updating versions for disassembly only): -def VST3q8 : VST3D<0b0101, 0b0000, "8">; -def VST3q16 : VST3D<0b0101, 0b0100, "16">; -def VST3q32 : VST3D<0b0101, 0b1000, "32">; -def VST3q8_UPD : VST3DWB<0b0101, 0b0000, "8">; -def VST3q16_UPD : VST3DWB<0b0101, 0b0100, "16">; -def VST3q32_UPD : VST3DWB<0b0101, 0b1000, "32">; - -def VST3q8Pseudo_UPD : VSTQQQQWBPseudo; -def VST3q16Pseudo_UPD : VSTQQQQWBPseudo; -def VST3q32Pseudo_UPD : VSTQQQQWBPseudo; + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u, + "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} + +def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">; +def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">; +def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">; + +def VST3d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; +def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; +def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; + +// ...with double-spaced registers: +def VST3q8 : VST3D<0b0101, {0,0,0,?}, "8">; +def VST3q16 : VST3D<0b0101, {0,1,0,?}, "16">; +def VST3q32 : VST3D<0b0101, {1,0,0,?}, "32">; +def VST3q8_UPD : VST3DWB<0b0101, {0,0,0,?}, "8">; +def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">; +def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">; + +def VST3q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; // ...alternate versions to be allocated odd register numbers: -def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo; -def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo; -def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo; +def VST3q8oddPseudo : VSTQQQQPseudo<IIC_VST3>; +def VST3q16oddPseudo : VSTQQQQPseudo<IIC_VST3>; +def VST3q32oddPseudo : VSTQQQQPseudo<IIC_VST3>; + +def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; // VST4 : Vector Store (multiple 4-element structures) class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, "vst4", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", - "", []>; + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; +} -def VST4d8 : VST4D<0b0000, 0b0000, "8">; -def VST4d16 : VST4D<0b0000, 0b0100, "16">; -def VST4d32 : VST4D<0b0000, 0b1000, "32">; +def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">; +def VST4d16 : VST4D<0b0000, {0,1,?,?}, "16">; +def VST4d32 : VST4D<0b0000, {1,0,?,?}, "32">; -def VST4d8Pseudo : VSTQQPseudo; -def VST4d16Pseudo : VSTQQPseudo; -def VST4d32Pseudo : VSTQQPseudo; +def VST4d8Pseudo : VSTQQPseudo<IIC_VST4>; +def VST4d16Pseudo : VSTQQPseudo<IIC_VST4>; +def VST4d32Pseudo : VSTQQPseudo<IIC_VST4>; // ...with address register writeback: class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST, - "vst4", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", []>; - -def VST4d8_UPD : VST4DWB<0b0000, 0b0000, "8">; -def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">; -def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">; - -def VST4d8Pseudo_UPD : VSTQQWBPseudo; -def VST4d16Pseudo_UPD : VSTQQWBPseudo; -def VST4d32Pseudo_UPD : VSTQQWBPseudo; - -// ...with double-spaced registers (non-updating versions for disassembly only): -def VST4q8 : VST4D<0b0001, 0b0000, "8">; -def VST4q16 : VST4D<0b0001, 0b0100, "16">; -def VST4q32 : VST4D<0b0001, 0b1000, "32">; -def VST4q8_UPD : VST4DWB<0b0001, 0b0000, "8">; -def VST4q16_UPD : VST4DWB<0b0001, 0b0100, "16">; -def VST4q32_UPD : VST4DWB<0b0001, 0b1000, "32">; - -def VST4q8Pseudo_UPD : VSTQQQQWBPseudo; -def VST4q16Pseudo_UPD : VSTQQQQWBPseudo; -def VST4q32Pseudo_UPD : VSTQQQQWBPseudo; + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u, + "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; +} + +def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">; +def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">; +def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">; + +def VST4d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; +def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; +def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; + +// ...with double-spaced registers: +def VST4q8 : VST4D<0b0001, {0,0,?,?}, "8">; +def VST4q16 : VST4D<0b0001, {0,1,?,?}, "16">; +def VST4q32 : VST4D<0b0001, {1,0,?,?}, "32">; +def VST4q8_UPD : VST4DWB<0b0001, {0,0,?,?}, "8">; +def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">; +def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">; + +def VST4q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; // ...alternate versions to be allocated odd register numbers: -def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo; -def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo; -def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo; +def VST4q8oddPseudo : VSTQQQQPseudo<IIC_VST4>; +def VST4q16oddPseudo : VSTQQQQPseudo<IIC_VST4>; +def VST4q32oddPseudo : VSTQQQQPseudo<IIC_VST4>; + +def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; + +} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 + +// Classes for VST*LN pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VSTQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; +class VSTQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; +class VSTQQQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQQQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; // VST1LN : Vector Store (single element from one lane) -// FIXME: Not yet implemented. +class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag StoreOp, SDNode ExtractOp> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, nohash_imm:$lane), + IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "", + [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6:$Rn)]> { + let Rm = 0b1111; +} +class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> + : VSTQLNPseudo<IIC_VST1ln> { + let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), + addrmode6:$addr)]; +} + +def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8, + NEONvgetlaneu> { + let Inst{7-5} = lane{2-0}; +} +def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16, + NEONvgetlaneu> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{5}; +} +def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt> { + let Inst{7} = lane{0}; + let Inst{5-4} = Rn{5-4}; +} + +def VST1LNq8Pseudo : VST1QLNPseudo<v16i8, truncstorei8, NEONvgetlaneu>; +def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, NEONvgetlaneu>; +def VST1LNq32Pseudo : VST1QLNPseudo<v4i32, store, extractelt>; + +def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr), + (VST1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>; +def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr), + (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; + +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { + +// ...with address register writeback: +class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, "vst1", Dt, + "\\{$Vd[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", []>; + +def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{5}; +} +def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5-4} = Rn{5-4}; +} + +def VST1LNq8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST1lnu>; +def VST1LNq16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST1lnu>; +def VST1LNq32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST1lnu>; // VST2LN : Vector Store (single 2-element structure from one lane) class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b00, op11_8, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), - IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr", - "", []>; + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane), + IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} -def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8">; -def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16">; -def VST2LNd32 : VST2LN<0b1001, {?,0,?,?}, "32">; +def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VST2LNd8Pseudo : VSTQLNPseudo<IIC_VST2ln>; +def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>; +def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>; // ...with double-spaced registers: -def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16">; -def VST2LNq32 : VST2LN<0b1001, {?,1,?,?}, "32">; +def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{4}; +} +def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{4} = Rn{4}; +} -// ...alternate versions to be allocated odd register numbers: -def VST2LNq16odd : VST2LN<0b0101, {?,?,1,?}, "16">; -def VST2LNq32odd : VST2LN<0b1001, {?,1,?,?}, "32">; +def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>; +def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>; // ...with address register writeback: class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt, + DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST2lnu, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr$offset", - "$addr.addr = $wb", []>; + "$addr.addr = $wb", []> { + let Inst{4} = Rn{4}; +} -def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8">; -def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16">; -def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,?,?}, "32">; +def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; +def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; +def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; + +def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; +} -def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16">; -def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,?,?}, "32">; +def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>; +def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>; // VST3LN : Vector Store (single 3-element structure from one lane) class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b00, op11_8, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, - nohash_imm:$lane), IIC_VST, "vst3", Dt, - "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr", "", []>; + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VST3ln, "vst3", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> { + let Rm = 0b1111; +} + +def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} -def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8">; -def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16">; -def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32">; +def VST3LNd8Pseudo : VSTQQLNPseudo<IIC_VST3ln>; +def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>; +def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>; // ...with double-spaced registers: -def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16">; -def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32">; +def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} -// ...alternate versions to be allocated odd register numbers: -def VST3LNq16odd : VST3LN<0b0110, {?,?,1,0}, "16">; -def VST3LNq32odd : VST3LN<0b1010, {?,1,0,0}, "32">; +def VST3LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>; +def VST3LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>; // ...with address register writeback: class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), - IIC_VST, "vst3", Dt, - "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr$offset", - "$addr.addr = $wb", []>; + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane), + IIC_VST3lnu, "vst3", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", []>; + +def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} -def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8">; -def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16">; -def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32">; +def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; + +def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} -def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16">; -def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32">; +def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>; // VST4LN : Vector Store (single 4-element structure from one lane) class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b00, op11_8, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, - nohash_imm:$lane), IIC_VST, "vst4", Dt, - "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr", - "", []>; + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VST4ln, "vst4", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} + +def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} -def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8">; -def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16">; -def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32">; +def VST4LNd8Pseudo : VSTQQLNPseudo<IIC_VST4ln>; +def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>; +def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>; // ...with double-spaced registers: -def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16">; -def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32">; +def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} -// ...alternate versions to be allocated odd register numbers: -def VST4LNq16odd : VST4LN<0b0111, {?,?,1,?}, "16">; -def VST4LNq32odd : VST4LN<0b1011, {?,1,?,?}, "32">; +def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>; +def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>; // ...with address register writeback: class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> - : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), - IIC_VST, "vst4", Dt, - "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr$offset", - "$addr.addr = $wb", []>; + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), + IIC_VST4lnu, "vst4", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} -def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8">; -def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16">; -def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32">; +def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; + +def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} -def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16">; -def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32">; +def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>; } // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 @@ -1000,98 +1687,92 @@ def SubReg_i32_lane : SDNodeXForm<imm, [{ // Instruction Classes //===----------------------------------------------------------------------===// -// Basic 2-register operations: single-, double- and quad-register. -class N2VS<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, - string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), - IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "", []>; +// Basic 2-register operations: double- and quad-register. class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), - (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt,"$dst, $src", "", - [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>; + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VUNAD, OpcodeStr, Dt,"$Vd, $Vm", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm))))]>; class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), - (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt,"$dst, $src", "", - [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>; + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VUNAQ, OpcodeStr, Dt,"$Vd, $Vm", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm))))]>; // Basic 2-register intrinsics, both double- and quad-register. class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), - (ins DPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>; + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), + (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), - (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), + (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; // Narrow 2-register operations. class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, SDNode OpNode> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst), - (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (TyD (OpNode (TyQ QPR:$src))))]>; + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd), + (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (TyD (OpNode (TyQ QPR:$Vm))))]>; // Narrow 2-register intrinsics. class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, Intrinsic IntOp> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst), - (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>; + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd), + (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>; // Long 2-register operations (currently only used for VMOVL). class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$dst), - (ins DPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (TyQ (OpNode (TyD DPR:$src))))]>; + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd), + (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vm))))]>; + +// Long 2-register intrinsics. +class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd), + (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>; // 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register. class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt> - : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$dst1, DPR:$dst2), - (ins DPR:$src1, DPR:$src2), IIC_VPERMD, - OpcodeStr, Dt, "$dst1, $dst2", - "$src1 = $dst1, $src2 = $dst2", []>; + : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$Vd, DPR:$Vm), + (ins DPR:$src1, DPR:$src2), IIC_VPERMD, + OpcodeStr, Dt, "$Vd, $Vm", + "$src1 = $Vd, $src2 = $Vm", []>; class N2VQShuffle<bits<2> op19_18, bits<5> op11_7, InstrItinClass itin, string OpcodeStr, string Dt> - : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$dst1, QPR:$dst2), - (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$dst1, $dst2", - "$src1 = $dst1, $src2 = $dst2", []>; - -// Basic 3-register operations: single-, double- and quad-register. -class N3VS<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - SDNode OpNode, bit Commutable> - : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, - IIC_VBIND, OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { - let isCommutable = Commutable; -} + : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$Vd, QPR:$Vm), + (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$Vd, $Vm", + "$src1 = $Vd, $src2 = $Vm", []>; +// Basic 3-register operations: double- and quad-register. class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> { let isCommutable = Commutable; } // Same as N3VD but no data type. @@ -1100,31 +1781,31 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, - OpcodeStr, "$dst, $src1, $src2", "", - [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]>{ + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{ let isCommutable = Commutable; } -class N3VDSL<bits<2> op21_20, bits<4> op11_8, +class N3VDSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, - (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", - [(set (Ty DPR:$dst), - (Ty (ShOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_VFP2:$src2),imm:$lane)))))]> { + (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (Ty DPR:$Vd), + (Ty (ShOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> { let isCommutable = 0; } -class N3VDSL16<bits<2> op21_20, bits<4> op11_8, +class N3VDSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, - (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","", - [(set (Ty DPR:$dst), - (Ty (ShOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { + (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm[$lane]","", + [(set (Ty DPR:$Vd), + (Ty (ShOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { let isCommutable = 0; } @@ -1132,40 +1813,40 @@ class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> { let isCommutable = Commutable; } class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, - OpcodeStr, "$dst, $src1, $src2", "", - [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]>{ + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{ let isCommutable = Commutable; } -class N3VQSL<bits<2> op21_20, bits<4> op11_8, +class N3VQSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", - [(set (ResTy QPR:$dst), - (ResTy (ShOp (ResTy QPR:$src1), - (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), + (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (ResTy QPR:$Vd), + (ResTy (ShOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]> { let isCommutable = 0; } class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","", - [(set (ResTy QPR:$dst), - (ResTy (ShOp (ResTy QPR:$src1), - (ResTy (NEONvduplane (OpTy DPR_8:$src2), + (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm[$lane]","", + [(set (ResTy QPR:$Vd), + (ResTy (ShOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]> { let isCommutable = 0; } @@ -1175,30 +1856,39 @@ class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), f, itin, - OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> { let isCommutable = Commutable; } -class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, +class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> : N3V<0, 1, op21_20, op11_8, 1, 0, - (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", - [(set (Ty DPR:$dst), - (Ty (IntOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_VFP2:$src2), + (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (Ty DPR:$Vd), + (Ty (IntOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_VFP2:$Vm), imm:$lane)))))]> { let isCommutable = 0; } class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> : N3V<0, 1, op21_20, op11_8, 1, 0, - (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", - [(set (Ty DPR:$dst), - (Ty (IntOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { + (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (Ty DPR:$Vd), + (Ty (IntOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $Vn", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> { let isCommutable = 0; } @@ -1206,20 +1896,20 @@ class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), f, itin, - OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> { let isCommutable = Commutable; } -class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, +class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<1, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", - [(set (ResTy QPR:$dst), - (ResTy (IntOp (ResTy QPR:$src1), - (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), + (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]> { let isCommutable = 0; } @@ -1227,93 +1917,95 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<1, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", - [(set (ResTy QPR:$dst), - (ResTy (IntOp (ResTy QPR:$src1), - (ResTy (NEONvduplane (OpTy DPR_8:$src2), + (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]> { let isCommutable = 0; } +class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $Vn", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> { + let isCommutable = 0; +} -// Multiply-Add/Sub operations: single-, double- and quad-register. -class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode MulOp, SDNode OpNode> - : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", []>; - +// Multiply-Add/Sub operations: double- and quad-register. class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode MulOp, SDNode OpNode> + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", - [(set DPR:$dst, (Ty (OpNode DPR:$src1, - (Ty (MulOp DPR:$src2, DPR:$src3)))))]>; + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (OpNode DPR:$src1, + (Ty (MulOp DPR:$Vn, DPR:$Vm)))))]>; + class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode MulOp, SDNode ShOp> + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, - (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", - [(set (Ty DPR:$dst), + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$src1), - (Ty (MulOp DPR:$src2, - (Ty (NEONvduplane (Ty DPR_VFP2:$src3), + (Ty (MulOp DPR:$Vn, + (Ty (NEONvduplane (Ty DPR_VFP2:$Vm), imm:$lane)))))))]>; class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, - (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", - [(set (Ty DPR:$dst), + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$src1), - (Ty (MulOp DPR:$src2, - (Ty (NEONvduplane (Ty DPR_8:$src3), + (Ty (MulOp DPR:$Vn, + (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))))]>; class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, - SDNode MulOp, SDNode OpNode> + SDPatternOperator MulOp, SDPatternOperator OpNode> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, (Ty (OpNode QPR:$src1, - (Ty (MulOp QPR:$src2, QPR:$src3)))))]>; + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (OpNode QPR:$src1, + (Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>; class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - SDNode MulOp, SDNode ShOp> + SDPatternOperator MulOp, SDPatternOperator ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", - [(set (ResTy QPR:$dst), + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), (ResTy (ShOp (ResTy QPR:$src1), - (ResTy (MulOp QPR:$src2, - (ResTy (NEONvduplane (OpTy DPR_VFP2:$src3), + (ResTy (MulOp QPR:$Vn, + (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))))]>; class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode MulOp, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), + (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", - [(set (ResTy QPR:$dst), + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), (ResTy (ShOp (ResTy QPR:$src1), - (ResTy (MulOp QPR:$src2, - (ResTy (NEONvduplane (OpTy DPR_8:$src3), + (ResTy (MulOp QPR:$Vn, + (ResTy (NEONvduplane (OpTy DPR_8:$Vm), imm:$lane)))))))]>; // Neon Intrinsic-Op instructions (VABA): double- and quad-register. @@ -1321,18 +2013,18 @@ class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", - [(set DPR:$dst, (Ty (OpNode DPR:$src1, - (Ty (IntOp (Ty DPR:$src2), (Ty DPR:$src3))))))]>; + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (OpNode DPR:$src1, + (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>; class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, (Ty (OpNode QPR:$src1, - (Ty (IntOp (Ty QPR:$src2), (Ty QPR:$src3))))))]>; + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (OpNode QPR:$src1, + (Ty (IntOp (Ty QPR:$Vn), (Ty QPR:$Vm))))))]>; // Neon 3-argument intrinsics, both double- and quad-register. // The destination register is also used as the first source operand register. @@ -1340,52 +2032,52 @@ class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", - [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), - (OpTy DPR:$src2), (OpTy DPR:$src3))))]>; + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$src1), + (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>; class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), - (OpTy QPR:$src2), (OpTy QPR:$src3))))]>; + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src1), + (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>; // Long Multiply-Add/Sub operations. class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, (OpNode (TyQ QPR:$src1), - (TyQ (MulOp (TyD DPR:$src2), - (TyD DPR:$src3)))))]>; + (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), + (TyQ (MulOp (TyD DPR:$Vn), + (TyD DPR:$Vm)))))]>; class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> - : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", - [(set QPR:$dst, + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), - (TyQ (MulOp (TyD DPR:$src2), - (TyD (NEONvduplane (TyD DPR_VFP2:$src3), + (TyQ (MulOp (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_VFP2:$Vm), imm:$lane))))))]>; class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> - : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", - [(set QPR:$dst, + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), - (TyQ (MulOp (TyD DPR:$src2), - (TyD (NEONvduplane (TyD DPR_8:$src3), + (TyQ (MulOp (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane))))))]>; // Long Intrinsic-Op vector operations with explicit extend (VABAL). @@ -1394,11 +2086,11 @@ class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, (OpNode (TyQ QPR:$src1), - (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src2), - (TyD DPR:$src3)))))))]>; + (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), + (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn), + (TyD DPR:$Vm)))))))]>; // Neon Long 3-argument intrinsic. The destination register is // a quad-register and is also used as the first source operand register. @@ -1406,35 +2098,35 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, - (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>; + (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, + (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>; class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), - (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", - [(set (ResTy QPR:$dst), + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), (ResTy (IntOp (ResTy QPR:$src1), - (OpTy DPR:$src2), - (OpTy (NEONvduplane (OpTy DPR_VFP2:$src3), + (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]>; class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), - (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, - OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", - [(set (ResTy QPR:$dst), + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), (ResTy (IntOp (ResTy QPR:$src1), - (OpTy DPR:$src2), - (OpTy (NEONvduplane (OpTy DPR_8:$src3), + (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]>; // Narrowing 3-register intrinsics. @@ -1442,9 +2134,9 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINi4D, - OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> { + (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vn), (TyQ QPR:$Vm))))]> { let isCommutable = Commutable; } @@ -1453,29 +2145,29 @@ class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set QPR:$dst, (TyQ (OpNode (TyD DPR:$src1), (TyD DPR:$src2))))]> { + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> { let isCommutable = Commutable; } class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode> : N3V<op24, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", - [(set QPR:$dst, - (TyQ (OpNode (TyD DPR:$src1), - (TyD (NEONvduplane (TyD DPR_VFP2:$src2),imm:$lane)))))]>; + (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set QPR:$Vd, + (TyQ (OpNode (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>; class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode> : N3V<op24, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", - [(set QPR:$dst, - (TyQ (OpNode (TyD DPR:$src1), - (TyD (NEONvduplane (TyD DPR_8:$src2), imm:$lane)))))]>; + (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set QPR:$Vd, + (TyQ (OpNode (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>; // Long 3-register operations with explicitly extended operands. class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, @@ -1483,10 +2175,10 @@ class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set QPR:$dst, (OpNode (TyQ (ExtOp (TyD DPR:$src1))), - (TyQ (ExtOp (TyD DPR:$src2)))))]> { + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (OpNode (TyQ (ExtOp (TyD DPR:$Vn))), + (TyQ (ExtOp (TyD DPR:$Vm)))))]> { let isCommutable = Commutable; } @@ -1496,10 +2188,10 @@ class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set QPR:$dst, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src1), - (TyD DPR:$src2))))))]> { + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn), + (TyD DPR:$Vm))))))]> { let isCommutable = Commutable; } @@ -1508,30 +2200,30 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, - OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> { + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> { let isCommutable = Commutable; } class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", - [(set (ResTy QPR:$dst), - (ResTy (IntOp (OpTy DPR:$src1), - (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2), + (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]>; class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", - [(set (ResTy QPR:$dst), - (ResTy (IntOp (OpTy DPR:$src1), - (OpTy (NEONvduplane (OpTy DPR_8:$src2), + (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]>; // Wide 3-register operations. @@ -1539,10 +2231,10 @@ class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), N3RegFrm, IIC_VSUBiD, - OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set QPR:$dst, (OpNode (TyQ QPR:$src1), - (TyQ (ExtOp (TyD DPR:$src2)))))]> { + (outs QPR:$Vd), (ins QPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VSUBiD, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (OpNode (TyQ QPR:$Vn), + (TyQ (ExtOp (TyD DPR:$Vm)))))]> { let isCommutable = Commutable; } @@ -1551,16 +2243,16 @@ class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), - (ins DPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>; + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), - (ins QPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; // Pairwise long 2-register accumulate intrinsics, // both double- and quad-register. @@ -1570,17 +2262,17 @@ class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), IIC_VPALiD, - OpcodeStr, Dt, "$dst, $src2", "$src1 = $dst", - [(set DPR:$dst, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$src2))))]>; + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD, + OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$Vm))))]>; class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VPALiQ, - OpcodeStr, Dt, "$dst, $src2", "$src1 = $dst", - [(set QPR:$dst, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$src2))))]>; + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ, + OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$Vm))))]>; // Shift by immediate, // both double- and quad-register. @@ -1588,25 +2280,25 @@ class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 0, op4, - (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), f, itin, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", - [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>; + (outs DPR:$Vd), (ins DPR:$Vm, i32imm:$SIMM), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set DPR:$Vd, (Ty (OpNode (Ty DPR:$Vm), (i32 imm:$SIMM))))]>; class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 1, op4, - (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), f, itin, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", - [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>; + (outs QPR:$Vd), (ins QPR:$Vm, i32imm:$SIMM), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>; // Long shift by immediate. class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), N2RegVShLFrm, - IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src, $SIMM", "", - [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src), + (outs QPR:$Vd), (ins DPR:$Vm, i32imm:$SIMM), N2RegVShLFrm, + IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>; // Narrow shift by immediate. @@ -1614,42 +2306,42 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), N2RegVShRFrm, itin, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", - [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src), + (outs DPR:$Vd), (ins QPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, itin, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>; // Shift right by immediate and accumulate, // both double- and quad-register. class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> - : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, - OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", - [(set DPR:$dst, (Ty (add DPR:$src1, - (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>; + : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (add DPR:$src1, + (Ty (ShOp DPR:$Vm, (i32 imm:$SIMM))))))]>; class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> - : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, - OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", - [(set QPR:$dst, (Ty (add QPR:$src1, - (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>; + : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (add QPR:$src1, + (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>; // Shift by immediate and insert, // both double- and quad-register. class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> - : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), f, IIC_VSHLiD, - OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", - [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>; + : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vm, i32imm:$SIMM), f, IIC_VSHLiD, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (ShOp DPR:$src1, DPR:$Vm, (i32 imm:$SIMM))))]>; class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> - : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), f, IIC_VSHLiQ, - OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", - [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>; + : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vm, i32imm:$SIMM), f, IIC_VSHLiQ, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>; // Convert, with fractional bits immediate, // both double- and quad-register. @@ -1657,16 +2349,16 @@ class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, - (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), NVCVTFrm, - IIC_VUNAD, OpcodeStr, Dt, "$dst, $src, $SIMM", "", - [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>; + (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, + IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>; class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, - (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), NVCVTFrm, - IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src, $SIMM", "", - [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>; + (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, + IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>; //===----------------------------------------------------------------------===// // Multiclasses @@ -1678,45 +2370,127 @@ class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // S = single int (32 bit) elements // D = double int (64 bit) elements -// Neon 2-register vector operations -- for disassembly only. +// Neon 2-register vector operations and intrinsics. -// First with only element sizes of 8, 16 and 32 bits: +// Neon 2-register comparisons. +// source operand element sizes of 8, 16 and 32 bits: multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, string opc, string Dt, - string asm> { + string asm, SDNode OpNode> { // 64-bit vector types. def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4, - (outs DPR:$dst), (ins DPR:$src), NoItinerary, - opc, !strconcat(Dt, "8"), asm, "", []>; + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "8"), asm, "", + [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>; def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, - (outs DPR:$dst), (ins DPR:$src), NoItinerary, - opc, !strconcat(Dt, "16"), asm, "", []>; + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "16"), asm, "", + [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>; def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, - (outs DPR:$dst), (ins DPR:$src), NoItinerary, - opc, !strconcat(Dt, "32"), asm, "", []>; + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "32"), asm, "", + [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>; def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, - (outs DPR:$dst), (ins DPR:$src), NoItinerary, - opc, "f32", asm, "", []> { + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, "f32", asm, "", + [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> { let Inst{10} = 1; // overwrite F = 1 } // 128-bit vector types. def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4, - (outs QPR:$dst), (ins QPR:$src), NoItinerary, - opc, !strconcat(Dt, "8"), asm, "", []>; + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "8"), asm, "", + [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>; def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, - (outs QPR:$dst), (ins QPR:$src), NoItinerary, - opc, !strconcat(Dt, "16"), asm, "", []>; + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "16"), asm, "", + [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>; def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, - (outs QPR:$dst), (ins QPR:$src), NoItinerary, - opc, !strconcat(Dt, "32"), asm, "", []>; + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "32"), asm, "", + [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>; def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, - (outs QPR:$dst), (ins QPR:$src), NoItinerary, - opc, "f32", asm, "", []> { + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, "f32", asm, "", + [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> { let Inst{10} = 1; // overwrite F = 1 } } + +// Neon 2-register vector intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; + def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>; + def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>; + def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>; + def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>; +} + + +// Neon Narrowing 2-register vector operations, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + def v8i8 : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, OpNode>; + def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, OpNode>; + def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, OpNode>; +} + +// Neon Narrowing 2-register vector intrinsics, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp> { + def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, IntOp>; + def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, IntOp>; + def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, IntOp>; +} + + +// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL). +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4, + string OpcodeStr, string Dt, SDNode OpNode> { + def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>; + def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>; + def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>; +} + + // Neon 3-register vector operations. // First with only element sizes of 8, 16 and 32 bits: @@ -1726,7 +2500,7 @@ multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, SDNode OpNode, bit Commutable = 0> { // 64-bit vector types. - def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, itinD16, + def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, itinD16, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, OpNode, Commutable>; def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16, @@ -1775,54 +2549,6 @@ multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, } -// Neon Narrowing 2-register vector operations, -// source operand element sizes of 16, 32 and 64 bits: -multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, - bits<5> op11_7, bit op6, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - SDNode OpNode> { - def v8i8 : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, - itin, OpcodeStr, !strconcat(Dt, "16"), - v8i8, v8i16, OpNode>; - def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, - itin, OpcodeStr, !strconcat(Dt, "32"), - v4i16, v4i32, OpNode>; - def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, - itin, OpcodeStr, !strconcat(Dt, "64"), - v2i32, v2i64, OpNode>; -} - -// Neon Narrowing 2-register vector intrinsics, -// source operand element sizes of 16, 32 and 64 bits: -multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, - bits<5> op11_7, bit op6, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - Intrinsic IntOp> { - def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, - itin, OpcodeStr, !strconcat(Dt, "16"), - v8i8, v8i16, IntOp>; - def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, - itin, OpcodeStr, !strconcat(Dt, "32"), - v4i16, v4i32, IntOp>; - def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, - itin, OpcodeStr, !strconcat(Dt, "64"), - v2i32, v2i64, IntOp>; -} - - -// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL). -// source operand element sizes of 16, 32 and 64 bits: -multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4, - string OpcodeStr, string Dt, SDNode OpNode> { - def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD, - OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>; - def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, - OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>; - def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, - OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>; -} - - // Neon 3-register vector intrinsics. // First with only element sizes of 16 and 32 bits: @@ -1847,8 +2573,29 @@ multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp, Commutable>; } +multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp> { + // 64-bit vector types. + def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, IntOp>; + def v2i32 : N3VDIntSh<op24, op23, 0b10, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, IntOp>; + + // 128-bit vector types. + def v8i16 : N3VQIntSh<op24, op23, 0b01, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, IntOp>; + def v4i32 : N3VQIntSh<op24, op23, 0b10, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, IntOp>; +} -multiclass N3VIntSL_HS<bits<4> op11_8, +multiclass N3VIntSL_HS<bits<4> op11_8, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp> { @@ -1877,6 +2624,21 @@ multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp, Commutable>; } +multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp> + : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp> { + def v8i8 : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, IntOp>; + def v16i8 : N3VQIntSh<op24, op23, 0b00, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, IntOp>; +} + // ....then also with element size of 64 bits: multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, @@ -1893,6 +2655,20 @@ multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, OpcodeStr, !strconcat(Dt, "64"), v2i64, v2i64, IntOp, Commutable>; } +multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp> + : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp> { + def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "64"), + v1i64, v1i64, IntOp>; + def v2i64 : N3VQIntSh<op24, op23, 0b11, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "64"), + v2i64, v2i64, IntOp>; +} // Neon Narrowing 3-register vector intrinsics, // source operand element sizes of 16, 32 and 64 bits: @@ -1920,7 +2696,7 @@ multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode, Commutable>; - def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16, + def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode, Commutable>; def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32, @@ -1944,7 +2720,7 @@ multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode, ExtOp, Commutable>; - def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16, + def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode, ExtOp, Commutable>; def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32, @@ -1959,7 +2735,7 @@ multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin16, InstrItinClass itin32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> { - def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16, + def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp, Commutable>; def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin32, @@ -1970,7 +2746,7 @@ multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, Intrinsic IntOp> { - def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin, + def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin, OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; @@ -1995,7 +2771,7 @@ multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp, ExtOp, Commutable>; - def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin, + def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp, ExtOp, Commutable>; def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin, @@ -2044,7 +2820,7 @@ multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, OpcodeStr, !strconcat(Dt, "32"), v4i32, mul, OpNode>; } -multiclass N3VMulOpSL_HS<bits<4> op11_8, +multiclass N3VMulOpSL_HS<bits<4> op11_8, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, SDNode ShOp> { @@ -2174,30 +2950,6 @@ multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, } -// Neon 2-register vector intrinsics, -// element sizes of 8, 16 and 32 bits: -multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, - bits<5> op11_7, bit op4, - InstrItinClass itinD, InstrItinClass itinQ, - string OpcodeStr, string Dt, Intrinsic IntOp> { - // 64-bit vector types. - def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, - itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; - def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, - itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>; - def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, - itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>; - - // 128-bit vector types. - def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, - itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>; - def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, - itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>; - def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, - itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>; -} - - // Neon Pairwise long 2-register intrinsics, // element sizes of 8, 16 and 32 bits: multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, @@ -2461,9 +3213,9 @@ def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul", "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>; def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul", "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>; -def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul", "f32", +def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32", v2f32, v2f32, fmul, 1>; -def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul", "f32", +def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32", v4f32, v4f32, fmul, 1>; defm VMULsl : N3VSL_HS<0b1000, "vmul", "i", mul>; def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>; @@ -2491,7 +3243,7 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), // VQDMULH : Vector Saturating Doubling Multiply Returning High Half defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D, - IIC_VMULi16Q, IIC_VMULi32Q, + IIC_VMULi16Q, IIC_VMULi32Q, "vqdmulh", "s", int_arm_neon_vqdmulh, 1>; defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, @@ -2555,15 +3307,19 @@ defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D, defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", - v2f32, fmul, fadd>; + v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", - v4f32, fmul, fadd>; + v4f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", - v2f32, fmul, fadd>; + v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32", - v4f32, v2f32, fmul, fadd>; + v4f32, v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def : Pat<(v8i16 (add (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), @@ -2581,14 +3337,15 @@ def : Pat<(v4i32 (add (v4i32 QPR:$src1), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; -def : Pat<(v4f32 (fadd (v4f32 QPR:$src1), - (fmul (v4f32 QPR:$src2), +def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1), + (fmul_su (v4f32 QPR:$src2), (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), (v4f32 (VMLAslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), (v2f32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), - (SubReg_i32_lane imm:$lane)))>; + (SubReg_i32_lane imm:$lane)))>, + Requires<[HasNEON, UseFPVMLx]>; // VMLAL : Vector Multiply Accumulate Long (Q += D * D) defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, @@ -2608,15 +3365,19 @@ defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>; defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", - v2f32, fmul, fsub>; + v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", - v4f32, fmul, fsub>; + v4f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", - v2f32, fmul, fsub>; + v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32", - v4f32, v2f32, fmul, fsub>; + v4f32, v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def : Pat<(v8i16 (sub (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), @@ -2634,13 +3395,14 @@ def : Pat<(v4i32 (sub (v4i32 QPR:$src1), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; -def : Pat<(v4f32 (fsub (v4f32 QPR:$src1), - (fmul (v4f32 QPR:$src2), +def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1), + (fmul_su (v4f32 QPR:$src2), (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), (v2f32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), - (SubReg_i32_lane imm:$lane)))>; + (SubReg_i32_lane imm:$lane)))>, + Requires<[HasNEON, UseFPVMLx]>; // VMLSL : Vector Multiply Subtract Long (Q -= D * D) defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, @@ -2703,25 +3465,24 @@ def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, NEONvceq, 1>; def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, NEONvceq, 1>; -// For disassembly only. + defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", - "$dst, $src, #0">; + "$Vd, $Vm, #0", NEONvceqz>; // VCGE : Vector Compare Greater Than or Equal defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>; -defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, +defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>; def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, NEONvcge, 0>; def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, NEONvcge, 0>; -// For disassembly only. + defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", - "$dst, $src, #0">; -// For disassembly only. + "$Vd, $Vm, #0", NEONvcgez>; defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s", - "$dst, $src, #0">; + "$Vd, $Vm, #0", NEONvclez>; // VCGT : Vector Compare Greater Than defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, @@ -2732,12 +3493,11 @@ def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, NEONvcgt, 0>; def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, NEONvcgt, 0>; -// For disassembly only. + defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", - "$dst, $src, #0">; -// For disassembly only. + "$Vd, $Vm, #0", NEONvcgtz>; defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", - "$dst, $src, #0">; + "$Vd, $Vm, #0", NEONvcltz>; // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", @@ -2750,7 +3510,7 @@ def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", "f32", v4i32, v4f32, int_arm_neon_vacgtq, 0>; // VTST : Vector Test Bits -defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, +defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; // Vector Bitwise Operations. @@ -2779,104 +3539,190 @@ def VORRd : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr", def VORRq : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", v4i32, v4i32, or, 1>; +def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1, + (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src), + IIC_VMOVImm, + "vorr", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1, + (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src), + IIC_VMOVImm, + "vorr", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + +def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1, + (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src), + IIC_VMOVImm, + "vorr", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1, + (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src), + IIC_VMOVImm, + "vorr", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + + // VBIC : Vector Bitwise Bit Clear (AND NOT) -def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, - "vbic", "$dst, $src1, $src2", "", - [(set DPR:$dst, (v2i32 (and DPR:$src1, - (vnotd DPR:$src2))))]>; -def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, - "vbic", "$dst, $src1, $src2", "", - [(set QPR:$dst, (v4i32 (and QPR:$src1, - (vnotq QPR:$src2))))]>; +def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, + "vbic", "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (v2i32 (and DPR:$Vn, + (vnotd DPR:$Vm))))]>; +def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), + (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ, + "vbic", "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (v4i32 (and QPR:$Vn, + (vnotq QPR:$Vm))))]>; + +def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1, + (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src), + IIC_VMOVImm, + "vbic", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1, + (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src), + IIC_VMOVImm, + "vbic", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + +def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1, + (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src), + IIC_VMOVImm, + "vbic", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1, + (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src), + IIC_VMOVImm, + "vbic", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} // VORN : Vector Bitwise OR NOT -def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, - "vorn", "$dst, $src1, $src2", "", - [(set DPR:$dst, (v2i32 (or DPR:$src1, - (vnotd DPR:$src2))))]>; -def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, - "vorn", "$dst, $src1, $src2", "", - [(set QPR:$dst, (v4i32 (or QPR:$src1, - (vnotq QPR:$src2))))]>; +def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, + "vorn", "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (v2i32 (or DPR:$Vn, + (vnotd DPR:$Vm))))]>; +def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$Vd), + (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ, + "vorn", "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (v4i32 (or QPR:$Vn, + (vnotq QPR:$Vm))))]>; // VMVN : Vector Bitwise NOT (Immediate) let isReMaterializable = 1 in { -def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$dst), + +def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmvn", "i16", "$dst, $SIMM", "", - [(set DPR:$dst, (v4i16 (NEONvmvnImm timm:$SIMM)))]>; -def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$dst), + "vmvn", "i16", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v4i16 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmvn", "i16", "$dst, $SIMM", "", - [(set QPR:$dst, (v8i16 (NEONvmvnImm timm:$SIMM)))]>; + "vmvn", "i16", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v8i16 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} -def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$dst), +def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmvn", "i32", "$dst, $SIMM", "", - [(set DPR:$dst, (v2i32 (NEONvmvnImm timm:$SIMM)))]>; -def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$dst), + "vmvn", "i32", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v2i32 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{11-8} = SIMM{11-8}; +} + +def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmvn", "i32", "$dst, $SIMM", "", - [(set QPR:$dst, (v4i32 (NEONvmvnImm timm:$SIMM)))]>; + "vmvn", "i32", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v4i32 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{11-8} = SIMM{11-8}; +} } // VMVN : Vector Bitwise NOT def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0, - (outs DPR:$dst), (ins DPR:$src), IIC_VSUBiD, - "vmvn", "$dst, $src", "", - [(set DPR:$dst, (v2i32 (vnotd DPR:$src)))]>; + (outs DPR:$Vd), (ins DPR:$Vm), IIC_VSUBiD, + "vmvn", "$Vd, $Vm", "", + [(set DPR:$Vd, (v2i32 (vnotd DPR:$Vm)))]>; def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, - (outs QPR:$dst), (ins QPR:$src), IIC_VSUBiD, - "vmvn", "$dst, $src", "", - [(set QPR:$dst, (v4i32 (vnotq QPR:$src)))]>; + (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSUBiD, + "vmvn", "$Vd, $Vm", "", + [(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>; def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>; def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>; // VBSL : Vector Bitwise Select -def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR:$src3), +def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VCNTiD, - "vbsl", "$dst, $src2, $src3", "$src1 = $dst", - [(set DPR:$dst, - (v2i32 (or (and DPR:$src2, DPR:$src1), - (and DPR:$src3, (vnotd DPR:$src1)))))]>; -def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, QPR:$src3), + "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, + (v2i32 (or (and DPR:$Vn, DPR:$src1), + (and DPR:$Vm, (vnotd DPR:$src1)))))]>; +def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VCNTiQ, - "vbsl", "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, - (v4i32 (or (and QPR:$src2, QPR:$src1), - (and QPR:$src3, (vnotq QPR:$src1)))))]>; + "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, + (v4i32 (or (and QPR:$Vn, QPR:$src1), + (and QPR:$Vm, (vnotq QPR:$src1)))))]>; // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", +// FIXME: This instruction's encoding MAY NOT BE correct. def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, - "vbif", "$dst, $src2, $src3", "$src1 = $dst", + "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd", [/* For disassembly only; pattern left blank */]>; def VBIFq : N3VX<1, 0, 0b11, 0b0001, 1, 1, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ, - "vbif", "$dst, $src2, $src3", "$src1 = $dst", + "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd", [/* For disassembly only; pattern left blank */]>; // VBIT : Vector Bitwise Insert if True // like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst", +// FIXME: This instruction's encoding MAY NOT BE correct. def VBITd : N3VX<1, 0, 0b10, 0b0001, 0, 1, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, - "vbit", "$dst, $src2, $src3", "$src1 = $dst", + "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd", [/* For disassembly only; pattern left blank */]>; def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ, - "vbit", "$dst, $src2, $src3", "$src1 = $dst", + "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd", [/* For disassembly only; pattern left blank */]>; // VBIT/VBIF are not yet implemented. The TwoAddress pass will not go looking @@ -2957,8 +3803,8 @@ def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VSHLiD, def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD, "vpadd", "i32", v2i32, v2i32, int_arm_neon_vpadd, 0>; -def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, - IIC_VBIND, "vpadd", "f32", +def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, + IIC_VPBIND, "vpadd", "f32", v2f32, v2f32, int_arm_neon_vpadd, 0>; // VPADDL : Vector Pairwise Add Long @@ -2986,7 +3832,7 @@ def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>; def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; -def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", +def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax", "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; // VPMIN : Vector Pairwise Minimum @@ -3002,16 +3848,16 @@ def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>; def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; -def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VSUBi4D, "vpmin", +def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin", "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; // Vector Reciprocal and Reciprocal Square Root Estimate and Step. // VRECPE : Vector Reciprocal Estimate -def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, +def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, IIC_VUNAD, "vrecpe", "u32", v2i32, v2i32, int_arm_neon_vrecpe>; -def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, +def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, IIC_VUNAQ, "vrecpe", "u32", v4i32, v4i32, int_arm_neon_vrecpe>; def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, @@ -3039,7 +3885,7 @@ def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, IIC_VUNAD, "vrsqrte", "f32", v2f32, v2f32, int_arm_neon_vrsqrte>; -def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, +def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, IIC_VUNAQ, "vrsqrte", "f32", v4f32, v4f32, int_arm_neon_vrsqrte>; @@ -3054,12 +3900,12 @@ def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, // Vector Shifts. // VSHL : Vector Shift -defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, N3RegVShFrm, +defm VSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 0, N3RegVShFrm, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, - "vshl", "s", int_arm_neon_vshifts, 0>; -defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, N3RegVShFrm, + "vshl", "s", int_arm_neon_vshifts>; +defm VSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, - "vshl", "u", int_arm_neon_vshiftu, 0>; + "vshl", "u", int_arm_neon_vshiftu>; // VSHL : Vector Shift Left (Immediate) defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl, N2RegVShLFrm>; @@ -3093,12 +3939,12 @@ defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", NEONvshrn>; // VRSHL : Vector Rounding Shift -defm VRSHLs : N3VInt_QHSD<0, 0, 0b0101, 0, N3RegVShFrm, +defm VRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, - "vrshl", "s", int_arm_neon_vrshifts, 0>; -defm VRSHLu : N3VInt_QHSD<1, 0, 0b0101, 0, N3RegVShFrm, + "vrshl", "s", int_arm_neon_vrshifts>; +defm VRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, - "vrshl", "u", int_arm_neon_vrshiftu, 0>; + "vrshl", "u", int_arm_neon_vrshiftu>; // VRSHR : Vector Rounding Shift Right defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs, N2RegVShRFrm>; @@ -3110,12 +3956,12 @@ defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", NEONvrshrn>; // VQSHL : Vector Saturating Shift -defm VQSHLs : N3VInt_QHSD<0, 0, 0b0100, 1, N3RegVShFrm, +defm VQSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, - "vqshl", "s", int_arm_neon_vqshifts, 0>; -defm VQSHLu : N3VInt_QHSD<1, 0, 0b0100, 1, N3RegVShFrm, + "vqshl", "s", int_arm_neon_vqshifts>; +defm VQSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, - "vqshl", "u", int_arm_neon_vqshiftu, 0>; + "vqshl", "u", int_arm_neon_vqshiftu>; // VQSHL : Vector Saturating Shift Left (Immediate) defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls, N2RegVShLFrm>; @@ -3136,12 +3982,12 @@ defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", NEONvqshrnsu>; // VQRSHL : Vector Saturating Rounding Shift -defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, N3RegVShFrm, +defm VQRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, - "vqrshl", "s", int_arm_neon_vqrshifts, 0>; -defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, N3RegVShFrm, + "vqrshl", "s", int_arm_neon_vqrshifts>; +defm VQRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 1, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, - "vqrshl", "u", int_arm_neon_vqrshiftu, 0>; + "vqrshl", "u", int_arm_neon_vqrshiftu>; // VQRSHRN : Vector Saturating Rounding Shift Right and Narrow defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s", @@ -3168,7 +4014,7 @@ defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri, N2RegVShRFrm>; // Vector Absolute and Saturating Absolute. // VABS : Vector Absolute Value -defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, +defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s", int_arm_neon_vabs>; def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, @@ -3179,7 +4025,7 @@ def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, v4f32, v4f32, int_arm_neon_vabs>; // VQABS : Vector Saturating Absolute Value -defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, +defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s", int_arm_neon_vqabs>; @@ -3191,13 +4037,13 @@ def vnegq : PatFrag<(ops node:$in), (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>; class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src), - IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (Ty (vnegd DPR:$src)))]>; + : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), + IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (vnegd DPR:$Vm)))]>; class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src), - IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (Ty (vnegq QPR:$src)))]>; + : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), + IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (vnegq QPR:$Vm)))]>; // VNEG : Vector Negate (integer) def VNEGs8d : VNEGD<0b00, "vneg", "s8", v8i8>; @@ -3209,13 +4055,13 @@ def VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>; // VNEG : Vector Negate (floating-point) def VNEGfd : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, - (outs DPR:$dst), (ins DPR:$src), IIC_VUNAD, - "vneg", "f32", "$dst, $src", "", - [(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>; + (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD, + "vneg", "f32", "$Vd, $Vm", "", + [(set DPR:$Vd, (v2f32 (fneg DPR:$Vm)))]>; def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, - (outs QPR:$dst), (ins QPR:$src), IIC_VUNAQ, - "vneg", "f32", "$dst, $src", "", - [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>; + (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ, + "vneg", "f32", "$Vd, $Vm", "", + [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>; def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>; def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>; @@ -3225,22 +4071,22 @@ def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>; def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>; // VQNEG : Vector Saturating Negate -defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, +defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg", "s", int_arm_neon_vqneg>; // Vector Bit Counting Operations. // VCLS : Vector Count Leading Sign Bits -defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, +defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s", int_arm_neon_vcls>; // VCLZ : Vector Count Leading Zeros -defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, +defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i", int_arm_neon_vclz>; // VCNT : Vector Count One Bits -def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, +def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, IIC_VCNTiD, "vcnt", "8", v8i8, v8i8, int_arm_neon_vcnt>; def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, @@ -3249,98 +4095,126 @@ def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, // Vector Swap -- for disassembly only. def VSWPd : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0, - (outs DPR:$dst), (ins DPR:$src), NoItinerary, - "vswp", "$dst, $src", "", []>; + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + "vswp", "$Vd, $Vm", "", []>; def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, - (outs QPR:$dst), (ins QPR:$src), NoItinerary, - "vswp", "$dst, $src", "", []>; + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + "vswp", "$Vd, $Vm", "", []>; // Vector Move Operations. // VMOV : Vector Move (Register) let neverHasSideEffects = 1 in { -def VMOVDneon: N3VX<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src), - N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; -def VMOVQ : N3VX<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src), - N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; +def VMOVDneon: N3VX<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$Vd), (ins DPR:$Vm), + N3RegFrm, IIC_VMOV, "vmov", "$Vd, $Vm", "", []> { + let Vn{4-0} = Vm{4-0}; +} +def VMOVQ : N3VX<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$Vd), (ins QPR:$Vm), + N3RegFrm, IIC_VMOV, "vmov", "$Vd, $Vm", "", []> { + let Vn{4-0} = Vm{4-0}; +} // Pseudo vector move instructions for QQ and QQQQ registers. This should // be expanded after register allocation is completed. def VMOVQQ : PseudoInst<(outs QQPR:$dst), (ins QQPR:$src), - NoItinerary, "${:comment} vmov\t$dst, $src", []>; + NoItinerary, []>; def VMOVQQQQ : PseudoInst<(outs QQQQPR:$dst), (ins QQQQPR:$src), - NoItinerary, "${:comment} vmov\t$dst, $src", []>; + NoItinerary, []>; } // neverHasSideEffects // VMOV : Vector Move (Immediate) let isReMaterializable = 1 in { -def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst), +def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmov", "i8", "$dst, $SIMM", "", - [(set DPR:$dst, (v8i8 (NEONvmovImm timm:$SIMM)))]>; -def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst), + "vmov", "i8", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v8i8 (NEONvmovImm timm:$SIMM)))]>; +def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmov", "i8", "$dst, $SIMM", "", - [(set QPR:$dst, (v16i8 (NEONvmovImm timm:$SIMM)))]>; + "vmov", "i8", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v16i8 (NEONvmovImm timm:$SIMM)))]>; -def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$dst), +def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmov", "i16", "$dst, $SIMM", "", - [(set DPR:$dst, (v4i16 (NEONvmovImm timm:$SIMM)))]>; -def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$dst), + "vmov", "i16", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v4i16 (NEONvmovImm timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmov", "i16", "$dst, $SIMM", "", - [(set QPR:$dst, (v8i16 (NEONvmovImm timm:$SIMM)))]>; + "vmov", "i16", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v8i16 (NEONvmovImm timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} -def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$dst), +def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmov", "i32", "$dst, $SIMM", "", - [(set DPR:$dst, (v2i32 (NEONvmovImm timm:$SIMM)))]>; -def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$dst), + "vmov", "i32", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v2i32 (NEONvmovImm timm:$SIMM)))]> { + let Inst{11-8} = SIMM{11-8}; +} + +def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmov", "i32", "$dst, $SIMM", "", - [(set QPR:$dst, (v4i32 (NEONvmovImm timm:$SIMM)))]>; + "vmov", "i32", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v4i32 (NEONvmovImm timm:$SIMM)))]> { + let Inst{11-8} = SIMM{11-8}; +} -def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst), +def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmov", "i64", "$dst, $SIMM", "", - [(set DPR:$dst, (v1i64 (NEONvmovImm timm:$SIMM)))]>; -def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst), + "vmov", "i64", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v1i64 (NEONvmovImm timm:$SIMM)))]>; +def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd), (ins nModImm:$SIMM), IIC_VMOVImm, - "vmov", "i64", "$dst, $SIMM", "", - [(set QPR:$dst, (v2i64 (NEONvmovImm timm:$SIMM)))]>; + "vmov", "i64", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>; } // isReMaterializable // VMOV : Vector Get Lane (move scalar to ARM core register) def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?}, - (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane), - IIC_VMOVSI, "vmov", "s8", "$dst, $src[$lane]", - [(set GPR:$dst, (NEONvgetlanes (v8i8 DPR:$src), - imm:$lane))]>; + (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "s8", "$R, $V[$lane]", + [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V), + imm:$lane))]> { + let Inst{21} = lane{2}; + let Inst{6-5} = lane{1-0}; +} def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1}, - (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane), - IIC_VMOVSI, "vmov", "s16", "$dst, $src[$lane]", - [(set GPR:$dst, (NEONvgetlanes (v4i16 DPR:$src), - imm:$lane))]>; + (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "s16", "$R, $V[$lane]", + [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V), + imm:$lane))]> { + let Inst{21} = lane{1}; + let Inst{6} = lane{0}; +} def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?}, - (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane), - IIC_VMOVSI, "vmov", "u8", "$dst, $src[$lane]", - [(set GPR:$dst, (NEONvgetlaneu (v8i8 DPR:$src), - imm:$lane))]>; + (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "u8", "$R, $V[$lane]", + [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V), + imm:$lane))]> { + let Inst{21} = lane{2}; + let Inst{6-5} = lane{1-0}; +} def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1}, - (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane), - IIC_VMOVSI, "vmov", "u16", "$dst, $src[$lane]", - [(set GPR:$dst, (NEONvgetlaneu (v4i16 DPR:$src), - imm:$lane))]>; + (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "u16", "$R, $V[$lane]", + [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V), + imm:$lane))]> { + let Inst{21} = lane{1}; + let Inst{6} = lane{0}; +} def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00, - (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane), - IIC_VMOVSI, "vmov", "32", "$dst, $src[$lane]", - [(set GPR:$dst, (extractelt (v2i32 DPR:$src), - imm:$lane))]>; + (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "32", "$R, $V[$lane]", + [(set GPR:$R, (extractelt (v2i32 DPR:$V), + imm:$lane))]> { + let Inst{21} = lane{0}; +} // def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane), (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src, @@ -3376,37 +4250,45 @@ def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2), // VMOV : Vector Set Lane (move ARM core register to scalar) -let Constraints = "$src1 = $dst" in { -def VSETLNi8 : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$dst), - (ins DPR:$src1, GPR:$src2, nohash_imm:$lane), - IIC_VMOVISL, "vmov", "8", "$dst[$lane], $src2", - [(set DPR:$dst, (vector_insert (v8i8 DPR:$src1), - GPR:$src2, imm:$lane))]>; -def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$dst), - (ins DPR:$src1, GPR:$src2, nohash_imm:$lane), - IIC_VMOVISL, "vmov", "16", "$dst[$lane], $src2", - [(set DPR:$dst, (vector_insert (v4i16 DPR:$src1), - GPR:$src2, imm:$lane))]>; -def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$dst), - (ins DPR:$src1, GPR:$src2, nohash_imm:$lane), - IIC_VMOVISL, "vmov", "32", "$dst[$lane], $src2", - [(set DPR:$dst, (insertelt (v2i32 DPR:$src1), - GPR:$src2, imm:$lane))]>; +let Constraints = "$src1 = $V" in { +def VSETLNi8 : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$V), + (ins DPR:$src1, GPR:$R, nohash_imm:$lane), + IIC_VMOVISL, "vmov", "8", "$V[$lane], $R", + [(set DPR:$V, (vector_insert (v8i8 DPR:$src1), + GPR:$R, imm:$lane))]> { + let Inst{21} = lane{2}; + let Inst{6-5} = lane{1-0}; +} +def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$V), + (ins DPR:$src1, GPR:$R, nohash_imm:$lane), + IIC_VMOVISL, "vmov", "16", "$V[$lane], $R", + [(set DPR:$V, (vector_insert (v4i16 DPR:$src1), + GPR:$R, imm:$lane))]> { + let Inst{21} = lane{1}; + let Inst{6} = lane{0}; +} +def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V), + (ins DPR:$src1, GPR:$R, nohash_imm:$lane), + IIC_VMOVISL, "vmov", "32", "$V[$lane], $R", + [(set DPR:$V, (insertelt (v2i32 DPR:$src1), + GPR:$R, imm:$lane))]> { + let Inst{21} = lane{0}; +} } def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane), - (v16i8 (INSERT_SUBREG QPR:$src1, + (v16i8 (INSERT_SUBREG QPR:$src1, (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1, (DSubReg_i8_reg imm:$lane))), GPR:$src2, (SubReg_i8_lane imm:$lane))), (DSubReg_i8_reg imm:$lane)))>; def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane), - (v8i16 (INSERT_SUBREG QPR:$src1, + (v8i16 (INSERT_SUBREG QPR:$src1, (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1, (DSubReg_i16_reg imm:$lane))), GPR:$src2, (SubReg_i16_lane imm:$lane))), (DSubReg_i16_reg imm:$lane)))>; def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane), - (v4i32 (INSERT_SUBREG QPR:$src1, + (v4i32 (INSERT_SUBREG QPR:$src1, (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1, (DSubReg_i32_reg imm:$lane))), GPR:$src2, (SubReg_i32_lane imm:$lane))), @@ -3454,13 +4336,13 @@ def : Pat<(v4i32 (scalar_to_vector GPR:$src)), // VDUP : Vector Duplicate (from ARM core register to all elements) class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty> - : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$dst), (ins GPR:$src), - IIC_VMOVIS, "vdup", Dt, "$dst, $src", - [(set DPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>; + : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$V), (ins GPR:$R), + IIC_VMOVIS, "vdup", Dt, "$V, $R", + [(set DPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>; class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty> - : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$dst), (ins GPR:$src), - IIC_VMOVIS, "vdup", Dt, "$dst, $src", - [(set QPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>; + : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$V), (ins GPR:$R), + IIC_VMOVIS, "vdup", Dt, "$V, $R", + [(set QPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>; def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>; def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>; @@ -3469,40 +4351,56 @@ def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>; def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>; def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>; -def VDUPfd : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$dst), (ins GPR:$src), - IIC_VMOVIS, "vdup", "32", "$dst, $src", - [(set DPR:$dst, (v2f32 (NEONvdup - (f32 (bitconvert GPR:$src)))))]>; -def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src), - IIC_VMOVIS, "vdup", "32", "$dst, $src", - [(set QPR:$dst, (v4f32 (NEONvdup - (f32 (bitconvert GPR:$src)))))]>; +def VDUPfd : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$V), (ins GPR:$R), + IIC_VMOVIS, "vdup", "32", "$V, $R", + [(set DPR:$V, (v2f32 (NEONvdup + (f32 (bitconvert GPR:$R)))))]>; +def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$V), (ins GPR:$R), + IIC_VMOVIS, "vdup", "32", "$V, $R", + [(set QPR:$V, (v4f32 (NEONvdup + (f32 (bitconvert GPR:$R)))))]>; // VDUP : Vector Duplicate Lane (from scalar to all elements) class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt, ValueType Ty> - : NVDupLane<op19_16, 0, (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), - IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]", - [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>; + : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, nohash_imm:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm[$lane]", + [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>; class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy> - : NVDupLane<op19_16, 1, (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), - IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]", - [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), + : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, nohash_imm:$lane), + IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm[$lane]", + [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm), imm:$lane)))]>; // Inst{19-16} is partially specified depending on the element size. -def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8>; -def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16>; -def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32>; -def VDUPLNfd : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32>; -def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8>; -def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16>; -def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32>; -def VDUPLNfq : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32>; +def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8> { + let Inst{19-17} = lane{2-0}; +} +def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16> { + let Inst{19-18} = lane{1-0}; +} +def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32> { + let Inst{19} = lane{0}; +} +def VDUPLNfd : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32> { + let Inst{19} = lane{0}; +} +def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8> { + let Inst{19-17} = lane{2-0}; +} +def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16> { + let Inst{19-18} = lane{1-0}; +} +def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32> { + let Inst{19} = lane{0}; +} +def VDUPLNfq : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32> { + let Inst{19} = lane{0}; +} def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, @@ -3521,18 +4419,13 @@ def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; -def VDUPfdf : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 0, 0, - (outs DPR:$dst), (ins SPR:$src), - IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "", +def VDUPfdf : PseudoNeonI<(outs DPR:$dst), (ins SPR:$src), IIC_VMOVD, "", [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>; - -def VDUPfqf : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 1, 0, - (outs QPR:$dst), (ins SPR:$src), - IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "", +def VDUPfqf : PseudoNeonI<(outs QPR:$dst), (ins SPR:$src), IIC_VMOVD, "", [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>; // VMOVN : Vector Narrowing Move -defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD, +defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN, "vmovn", "i", trunc>; // VQMOVN : Vector Saturating Narrowing Move defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD, @@ -3585,20 +4478,30 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; +// VCVT : Vector Convert Between Half-Precision and Single-Precision. +def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0, + IIC_VUNAQ, "vcvt", "f16.f32", + v4i16, v4f32, int_arm_neon_vcvtfp2hf>, + Requires<[HasNEON, HasFP16]>; +def VCVTh2f : N2VLInt<0b11, 0b11, 0b01, 0b10, 0b01110, 0, 0, + IIC_VUNAQ, "vcvt", "f32.f16", + v4f32, v4i16, int_arm_neon_vcvthf2fp>, + Requires<[HasNEON, HasFP16]>; + // Vector Reverse. // VREV64 : Vector Reverse elements within 64-bit doublewords class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$dst), - (ins DPR:$src), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (Ty (NEONvrev64 (Ty DPR:$src))))]>; + : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VMOVD, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>; class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$dst), - (ins QPR:$src), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (Ty (NEONvrev64 (Ty QPR:$src))))]>; + : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VMOVQ, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>; def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>; def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>; @@ -3613,15 +4516,15 @@ def VREV64qf : VREV64Q<0b10, "vrev64", "32", v4f32>; // VREV32 : Vector Reverse elements within 32-bit words class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$dst), - (ins DPR:$src), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (Ty (NEONvrev32 (Ty DPR:$src))))]>; + : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VMOVD, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>; class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$dst), - (ins QPR:$src), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (Ty (NEONvrev32 (Ty QPR:$src))))]>; + : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VMOVQ, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>; def VREV32d8 : VREV32D<0b00, "vrev32", "8", v8i8>; def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>; @@ -3632,46 +4535,91 @@ def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>; // VREV16 : Vector Reverse elements within 16-bit halfwords class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$dst), - (ins DPR:$src), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (Ty (NEONvrev16 (Ty DPR:$src))))]>; + : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VMOVD, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>; class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$dst), - (ins QPR:$src), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (Ty (NEONvrev16 (Ty QPR:$src))))]>; + : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VMOVQ, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>; def VREV16d8 : VREV16D<0b00, "vrev16", "8", v8i8>; def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; // Other Vector Shuffles. +// Aligned extractions: really just dropping registers + +class AlignedVEXTq<ValueType DestTy, ValueType SrcTy, SDNodeXForm LaneCVT> + : Pat<(DestTy (vector_extract_subvec (SrcTy QPR:$src), (i32 imm:$start))), + (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>; + +def : AlignedVEXTq<v8i8, v16i8, DSubReg_i8_reg>; + +def : AlignedVEXTq<v4i16, v8i16, DSubReg_i16_reg>; + +def : AlignedVEXTq<v2i32, v4i32, DSubReg_i32_reg>; + +def : AlignedVEXTq<v1i64, v2i64, DSubReg_f64_reg>; + +def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>; + + // VEXT : Vector Extract class VEXTd<string OpcodeStr, string Dt, ValueType Ty> - : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$dst), - (ins DPR:$lhs, DPR:$rhs, i32imm:$index), NVExtFrm, - IIC_VEXTD, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", - [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs), - (Ty DPR:$rhs), imm:$index)))]>; + : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm, i32imm:$index), NVExtFrm, + IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "", + [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn), + (Ty DPR:$Vm), imm:$index)))]> { + bits<4> index; + let Inst{11-8} = index{3-0}; +} class VEXTq<string OpcodeStr, string Dt, ValueType Ty> - : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$dst), - (ins QPR:$lhs, QPR:$rhs, i32imm:$index), NVExtFrm, - IIC_VEXTQ, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", - [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs), - (Ty QPR:$rhs), imm:$index)))]>; - -def VEXTd8 : VEXTd<"vext", "8", v8i8>; -def VEXTd16 : VEXTd<"vext", "16", v4i16>; -def VEXTd32 : VEXTd<"vext", "32", v2i32>; -def VEXTdf : VEXTd<"vext", "32", v2f32>; - -def VEXTq8 : VEXTq<"vext", "8", v16i8>; -def VEXTq16 : VEXTq<"vext", "16", v8i16>; -def VEXTq32 : VEXTq<"vext", "32", v4i32>; -def VEXTqf : VEXTq<"vext", "32", v4f32>; + : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$Vd), + (ins QPR:$Vn, QPR:$Vm, i32imm:$index), NVExtFrm, + IIC_VEXTQ, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "", + [(set QPR:$Vd, (Ty (NEONvext (Ty QPR:$Vn), + (Ty QPR:$Vm), imm:$index)))]> { + bits<4> index; + let Inst{11-8} = index{3-0}; +} + +def VEXTd8 : VEXTd<"vext", "8", v8i8> { + let Inst{11-8} = index{3-0}; +} +def VEXTd16 : VEXTd<"vext", "16", v4i16> { + let Inst{11-9} = index{2-0}; + let Inst{8} = 0b0; +} +def VEXTd32 : VEXTd<"vext", "32", v2i32> { + let Inst{11-10} = index{1-0}; + let Inst{9-8} = 0b00; +} +def VEXTdf : VEXTd<"vext", "32", v2f32> { + let Inst{11} = index{0}; + let Inst{10-8} = 0b000; +} + +def VEXTq8 : VEXTq<"vext", "8", v16i8> { + let Inst{11-8} = index{3-0}; +} +def VEXTq16 : VEXTq<"vext", "16", v8i16> { + let Inst{11-9} = index{2-0}; + let Inst{8} = 0b0; +} +def VEXTq32 : VEXTq<"vext", "32", v4i32> { + let Inst{11-10} = index{1-0}; + let Inst{9-8} = 0b00; +} +def VEXTqf : VEXTq<"vext", "32", v4f32> { + let Inst{11} = index{0}; + let Inst{10-8} = 0b000; +} // VTRN : Vector Transpose @@ -3707,160 +4655,120 @@ def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">; // VTBL : Vector Table Lookup def VTBL1 - : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTB1, - "vtbl", "8", "$dst, \\{$tbl1\\}, $src", "", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>; + : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1, + "vtbl", "8", "$Vd, \\{$Vn\\}, $Vm", "", + [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbl1 DPR:$Vn, DPR:$Vm)))]>; let hasExtraSrcRegAllocReq = 1 in { def VTBL2 - : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTB2, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", []>; + : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$tbl2, DPR:$Vm), NVTBLFrm, IIC_VTB2, + "vtbl", "8", "$Vd, \\{$Vn, $tbl2\\}, $Vm", "", []>; def VTBL3 - : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTB3, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", []>; + : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$Vm), NVTBLFrm, IIC_VTB3, + "vtbl", "8", "$Vd, \\{$Vn, $tbl2, $tbl3\\}, $Vm", "", []>; def VTBL4 - : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), + : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$Vm), NVTBLFrm, IIC_VTB4, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", []>; + "vtbl", "8", "$Vd, \\{$Vn, $tbl2, $tbl3, $tbl4\\}, $Vm", "", []>; } // hasExtraSrcRegAllocReq = 1 +def VTBL2Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins QPR:$tbl, DPR:$src), IIC_VTB2, "", []>; +def VTBL3Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB3, "", []>; +def VTBL4Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB4, "", []>; + // VTBX : Vector Table Extension def VTBX1 - : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTBX1, - "vtbx", "8", "$dst, \\{$tbl1\\}, $src", "$orig = $dst", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1 - DPR:$orig, DPR:$tbl1, DPR:$src)))]>; + : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$Vd), + (ins DPR:$orig, DPR:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX1, + "vtbx", "8", "$Vd, \\{$Vn\\}, $Vm", "$orig = $Vd", + [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbx1 + DPR:$orig, DPR:$Vn, DPR:$Vm)))]>; let hasExtraSrcRegAllocReq = 1 in { def VTBX2 - : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTBX2, - "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", []>; + : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$Vd), + (ins DPR:$orig, DPR:$Vn, DPR:$tbl2, DPR:$Vm), NVTBLFrm, IIC_VTBX2, + "vtbx", "8", "$Vd, \\{$Vn, $tbl2\\}, $Vm", "$orig = $Vd", []>; def VTBX3 - : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), + : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$Vd), + (ins DPR:$orig, DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$Vm), NVTBLFrm, IIC_VTBX3, - "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", - "$orig = $dst", []>; + "vtbx", "8", "$Vd, \\{$Vn, $tbl2, $tbl3\\}, $Vm", + "$orig = $Vd", []>; def VTBX4 - : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTBX4, - "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", - "$orig = $dst", []>; + : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$Vd), (ins DPR:$orig, DPR:$Vn, + DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$Vm), NVTBLFrm, IIC_VTBX4, + "vtbx", "8", "$Vd, \\{$Vn, $tbl2, $tbl3, $tbl4\\}, $Vm", + "$orig = $Vd", []>; } // hasExtraSrcRegAllocReq = 1 +def VTBX2Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QPR:$tbl, DPR:$src), + IIC_VTBX2, "$orig = $dst", []>; +def VTBX3Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src), + IIC_VTBX3, "$orig = $dst", []>; +def VTBX4Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src), + IIC_VTBX4, "$orig = $dst", []>; + //===----------------------------------------------------------------------===// // NEON instructions for single-precision FP math //===----------------------------------------------------------------------===// -class N2VSPat<SDNode OpNode, ValueType ResTy, ValueType OpTy, NeonI Inst> - : NEONFPPat<(ResTy (OpNode SPR:$a)), - (EXTRACT_SUBREG (OpTy (Inst (INSERT_SUBREG (OpTy (IMPLICIT_DEF)), - SPR:$a, ssub_0))), - ssub_0)>; +class N2VSPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$a)), + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (Inst + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$a, ssub_0)), DPR_VFP2)), ssub_0)>; class N3VSPat<SDNode OpNode, NeonI Inst> : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)), - (EXTRACT_SUBREG (v2f32 - (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$a, ssub_0), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$b, ssub_0))), - ssub_0)>; + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (Inst + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$a, ssub_0), + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>; class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst> : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))), - (EXTRACT_SUBREG (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$acc, ssub_0), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$a, ssub_0), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$b, ssub_0)), - ssub_0)>; - -// These need separate instructions because they must use DPR_VFP2 register -// class which have SPR sub-registers. - -// Vector Add Operations used for single-precision FP -let neverHasSideEffects = 1 in -def VADDfd_sfp : N3VS<0,0,0b00,0b1101,0, "vadd", "f32", v2f32, v2f32, fadd, 1>; -def : N3VSPat<fadd, VADDfd_sfp>; - -// Vector Sub Operations used for single-precision FP -let neverHasSideEffects = 1 in -def VSUBfd_sfp : N3VS<0,0,0b10,0b1101,0, "vsub", "f32", v2f32, v2f32, fsub, 0>; -def : N3VSPat<fsub, VSUBfd_sfp>; - -// Vector Multiply Operations used for single-precision FP -let neverHasSideEffects = 1 in -def VMULfd_sfp : N3VS<1,0,0b00,0b1101,1, "vmul", "f32", v2f32, v2f32, fmul, 1>; -def : N3VSPat<fmul, VMULfd_sfp>; - -// Vector Multiply-Accumulate/Subtract used for single-precision FP -// vml[as].f32 can cause 4-8 cycle stalls in following ASIMD instructions, so -// we want to avoid them for now. e.g., alternating vmla/vadd instructions. - -//let neverHasSideEffects = 1 in -//def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32", -// v2f32, fmul, fadd>; -//def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>; - -//let neverHasSideEffects = 1 in -//def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32", -// v2f32, fmul, fsub>; -//def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>; - -// Vector Absolute used for single-precision FP -let neverHasSideEffects = 1 in -def VABSfd_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01110, 0, 0, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD, - "vabs", "f32", "$dst, $src", "", []>; -def : N2VSPat<fabs, f32, v2f32, VABSfd_sfp>; - -// Vector Negate used for single-precision FP -let neverHasSideEffects = 1 in -def VNEGfd_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD, - "vneg", "f32", "$dst, $src", "", []>; -def : N2VSPat<fneg, f32, v2f32, VNEGfd_sfp>; - -// Vector Maximum used for single-precision FP -let neverHasSideEffects = 1 in -def VMAXfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, - "vmax", "f32", "$dst, $src1, $src2", "", []>; -def : N3VSPat<NEONfmax, VMAXfd_sfp>; - -// Vector Minimum used for single-precision FP -let neverHasSideEffects = 1 in -def VMINfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, - "vmin", "f32", "$dst, $src1, $src2", "", []>; -def : N3VSPat<NEONfmin, VMINfd_sfp>; - -// Vector Convert between single-precision FP and integer -let neverHasSideEffects = 1 in -def VCVTf2sd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32", - v2i32, v2f32, fp_to_sint>; -def : N2VSPat<arm_ftosi, f32, v2f32, VCVTf2sd_sfp>; - -let neverHasSideEffects = 1 in -def VCVTf2ud_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32", - v2i32, v2f32, fp_to_uint>; -def : N2VSPat<arm_ftoui, f32, v2f32, VCVTf2ud_sfp>; - -let neverHasSideEffects = 1 in -def VCVTs2fd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", - v2f32, v2i32, sint_to_fp>; -def : N2VSPat<arm_sitof, f32, v2i32, VCVTs2fd_sfp>; - -let neverHasSideEffects = 1 in -def VCVTu2fd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", - v2f32, v2i32, uint_to_fp>; -def : N2VSPat<arm_uitof, f32, v2i32, VCVTu2fd_sfp>; + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (Inst + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$acc, ssub_0), + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$a, ssub_0), + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>; + +def : N3VSPat<fadd, VADDfd>; +def : N3VSPat<fsub, VSUBfd>; +def : N3VSPat<fmul, VMULfd>; +def : N3VSMulOpPat<fmul, fadd, VMLAfd>, + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; +def : N3VSMulOpPat<fmul, fsub, VMLSfd>, + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; +def : N2VSPat<fabs, VABSfd>; +def : N2VSPat<fneg, VNEGfd>; +def : N3VSPat<NEONfmax, VMAXfd>; +def : N3VSPat<NEONfmin, VMINfd>; +def : N2VSPat<arm_ftosi, VCVTf2sd>; +def : N2VSPat<arm_ftoui, VCVTf2ud>; +def : N2VSPat<arm_sitof, VCVTs2fd>; +def : N2VSPat<arm_uitof, VCVTu2fd>; //===----------------------------------------------------------------------===// // Non-Instruction Patterns diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td index a13ff12..826ef46 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -1,4 +1,4 @@ -//===- ARMInstrThumb.td - Thumb support for ARM ---------------------------===// +//===- ARMInstrThumb.td - Thumb support for ARM ------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -16,7 +16,7 @@ // def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall, - [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; def imm_neg_XFORM : SDNodeXForm<imm, [{ @@ -26,7 +26,6 @@ def imm_comp_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32); }]>; - /// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7]. def imm0_7 : PatLeaf<(i32 imm), [{ return (uint32_t)N->getZExtValue() < 8; @@ -50,9 +49,9 @@ def imm8_255_neg : PatLeaf<(i32 imm), [{ return Val >= 8 && Val < 256; }], imm_neg_XFORM>; -// Break imm's up into two pieces: an immediate + a left shift. -// This uses thumb_immshifted to match and thumb_immshifted_val and -// thumb_immshifted_shamt to get the val/shift pieces. +// Break imm's up into two pieces: an immediate + a left shift. This uses +// thumb_immshifted to match and thumb_immshifted_val and thumb_immshifted_shamt +// to get the val/shift pieces. def thumb_immshifted : PatLeaf<(imm), [{ return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue()); }]>; @@ -67,6 +66,11 @@ def thumb_immshifted_shamt : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(V, MVT::i32); }]>; +// ADR instruction labels. +def t_adrlabel : Operand<i32> { + let EncoderMethod = "getThumbAdrLabelOpValue"; +} + // Scaled 4 immediate. def t_imm_s4 : Operand<i32> { let PrintMethod = "printThumbS4ImmOperand"; @@ -74,47 +78,114 @@ def t_imm_s4 : Operand<i32> { // Define Thumb specific addressing modes. +def t_brtarget : Operand<OtherVT> { + let EncoderMethod = "getThumbBRTargetOpValue"; +} + +def t_bcctarget : Operand<i32> { + let EncoderMethod = "getThumbBCCTargetOpValue"; +} + +def t_cbtarget : Operand<i32> { + let EncoderMethod = "getThumbCBTargetOpValue"; +} + +def t_bltarget : Operand<i32> { + let EncoderMethod = "getThumbBLTargetOpValue"; +} + +def t_blxtarget : Operand<i32> { + let EncoderMethod = "getThumbBLXTargetOpValue"; +} + +def MemModeRegThumbAsmOperand : AsmOperandClass { + let Name = "MemModeRegThumb"; + let SuperClasses = []; +} + +def MemModeImmThumbAsmOperand : AsmOperandClass { + let Name = "MemModeImmThumb"; + let SuperClasses = []; +} + // t_addrmode_rr := reg + reg // def t_addrmode_rr : Operand<i32>, ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; let PrintMethod = "printThumbAddrModeRROperand"; let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); } -// t_addrmode_s4 := reg + reg -// reg + imm5 * 4 +// t_addrmode_rrs := reg + reg // -def t_addrmode_s4 : Operand<i32>, - ComplexPattern<i32, 3, "SelectThumbAddrModeS4", []> { - let PrintMethod = "printThumbAddrModeS4Operand"; - let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg); +def t_addrmode_rrs1 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let PrintMethod = "printThumbAddrModeRROperand"; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); + let ParserMatchClass = MemModeRegThumbAsmOperand; +} +def t_addrmode_rrs2 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let PrintMethod = "printThumbAddrModeRROperand"; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); + let ParserMatchClass = MemModeRegThumbAsmOperand; +} +def t_addrmode_rrs4 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let PrintMethod = "printThumbAddrModeRROperand"; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); + let ParserMatchClass = MemModeRegThumbAsmOperand; +} + +// t_addrmode_is4 := reg + imm5 * 4 +// +def t_addrmode_is4 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> { + let EncoderMethod = "getAddrModeISOpValue"; + let PrintMethod = "printThumbAddrModeImm5S4Operand"; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); + let ParserMatchClass = MemModeImmThumbAsmOperand; } -// t_addrmode_s2 := reg + reg -// reg + imm5 * 2 +// t_addrmode_is2 := reg + imm5 * 2 // -def t_addrmode_s2 : Operand<i32>, - ComplexPattern<i32, 3, "SelectThumbAddrModeS2", []> { - let PrintMethod = "printThumbAddrModeS2Operand"; - let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg); +def t_addrmode_is2 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> { + let EncoderMethod = "getAddrModeISOpValue"; + let PrintMethod = "printThumbAddrModeImm5S2Operand"; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); + let ParserMatchClass = MemModeImmThumbAsmOperand; } -// t_addrmode_s1 := reg + reg -// reg + imm5 +// t_addrmode_is1 := reg + imm5 // -def t_addrmode_s1 : Operand<i32>, - ComplexPattern<i32, 3, "SelectThumbAddrModeS1", []> { - let PrintMethod = "printThumbAddrModeS1Operand"; - let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg); +def t_addrmode_is1 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> { + let EncoderMethod = "getAddrModeISOpValue"; + let PrintMethod = "printThumbAddrModeImm5S1Operand"; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); + let ParserMatchClass = MemModeImmThumbAsmOperand; } // t_addrmode_sp := sp + imm8 * 4 // def t_addrmode_sp : Operand<i32>, ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> { + let EncoderMethod = "getAddrModeThumbSPOpValue"; let PrintMethod = "printThumbAddrModeSPOperand"; let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); + let ParserMatchClass = MemModeImmThumbAsmOperand; +} + +// t_addrmode_pc := <label> => pc + imm8 * 4 +// +def t_addrmode_pc : Operand<i32> { + let EncoderMethod = "getAddrModePCOpValue"; + let ParserMatchClass = MemModeImmThumbAsmOperand; } //===----------------------------------------------------------------------===// @@ -126,132 +197,162 @@ def t_addrmode_sp : Operand<i32>, // these will always be in pairs, and asserts if it finds otherwise. Better way? let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { def tADJCALLSTACKUP : -PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary, - "${:comment} tADJCALLSTACKUP $amt1", - [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb1Only]>; + PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary, + [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, + Requires<[IsThumb, IsThumb1Only]>; def tADJCALLSTACKDOWN : -PseudoInst<(outs), (ins i32imm:$amt), NoItinerary, - "${:comment} tADJCALLSTACKDOWN $amt", - [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb1Only]>; + PseudoInst<(outs), (ins i32imm:$amt), NoItinerary, + [(ARMcallseq_start imm:$amt)]>, + Requires<[IsThumb, IsThumb1Only]>; +} + +// T1Disassembly - A simple class to make encoding some disassembly patterns +// easier and less verbose. +class T1Disassembly<bits<2> op1, bits<8> op2> + : T1Encoding<0b101111> { + let Inst{9-8} = op1; + let Inst{7-0} = op2; } def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", [/* For disassembly only; pattern left blank */]>, - T1Encoding<0b101111> { - let Inst{9-8} = 0b11; - let Inst{7-0} = 0b00000000; -} + T1Disassembly<0b11, 0x00>; // A8.6.110 def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", [/* For disassembly only; pattern left blank */]>, - T1Encoding<0b101111> { - let Inst{9-8} = 0b11; - let Inst{7-0} = 0b00010000; -} + T1Disassembly<0b11, 0x10>; // A8.6.410 def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", [/* For disassembly only; pattern left blank */]>, - T1Encoding<0b101111> { - let Inst{9-8} = 0b11; - let Inst{7-0} = 0b00100000; -} + T1Disassembly<0b11, 0x20>; // A8.6.408 def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", [/* For disassembly only; pattern left blank */]>, - T1Encoding<0b101111> { - let Inst{9-8} = 0b11; - let Inst{7-0} = 0b00110000; -} + T1Disassembly<0b11, 0x30>; // A8.6.409 def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", [/* For disassembly only; pattern left blank */]>, - T1Encoding<0b101111> { - let Inst{9-8} = 0b11; - let Inst{7-0} = 0b01000000; -} + T1Disassembly<0b11, 0x40>; // A8.6.157 + +// The i32imm operand $val can be used by a debugger to store more information +// about the breakpoint. +def tBKPT : T1I<(outs), (ins i32imm:$val), NoItinerary, "bkpt\t$val", + [/* For disassembly only; pattern left blank */]>, + T1Disassembly<0b10, {?,?,?,?,?,?,?,?}> { + // A8.6.22 + bits<8> val; + let Inst{7-0} = val; +} def tSETENDBE : T1I<(outs), (ins), NoItinerary, "setend\tbe", [/* For disassembly only; pattern left blank */]>, T1Encoding<0b101101> { + // A8.6.156 let Inst{9-5} = 0b10010; - let Inst{3} = 1; + let Inst{4} = 1; + let Inst{3} = 1; // Big-Endian + let Inst{2-0} = 0b000; } def tSETENDLE : T1I<(outs), (ins), NoItinerary, "setend\tle", [/* For disassembly only; pattern left blank */]>, T1Encoding<0b101101> { + // A8.6.156 let Inst{9-5} = 0b10010; - let Inst{3} = 0; + let Inst{4} = 1; + let Inst{3} = 0; // Little-Endian + let Inst{2-0} = 0b000; } -// The i32imm operand $val can be used by a debugger to store more information -// about the breakpoint. -def tBKPT : T1I<(outs), (ins i32imm:$val), NoItinerary, "bkpt\t$val", +// Change Processor State is a system instruction -- for disassembly only. +def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags), + NoItinerary, "cps$imod $iflags", [/* For disassembly only; pattern left blank */]>, - T1Encoding<0b101111> { - let Inst{9-8} = 0b10; + T1Misc<0b0110011> { + // A8.6.38 & B6.1.1 + bit imod; + bits<3> iflags; + + let Inst{4} = imod; + let Inst{3} = 0; + let Inst{2-0} = iflags; } -// Change Processor State is a system instruction -- for disassembly only. -// The singleton $opt operand contains the following information: -// opt{4-0} = mode ==> don't care -// opt{5} = changemode ==> 0 (false for 16-bit Thumb instr) -// opt{8-6} = AIF from Inst{2-0} -// opt{10-9} = 1:imod from Inst{4} with 0b10 as enable and 0b11 as disable -// -// The opt{4-0} and opt{5} sub-fields are to accommodate 32-bit Thumb and ARM -// CPS which has more options. -def tCPS : T1I<(outs), (ins cps_opt:$opt), NoItinerary, "cps$opt", - [/* For disassembly only; pattern left blank */]>, - T1Misc<0b0110011>; - // For both thumb1 and thumb2. -let isNotDuplicable = 1 in -def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, - "\n$cp:\n\tadd\t$dst, pc", - [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>, +let isNotDuplicable = 1, isCodeGenOnly = 1 in +def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "", + [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>, T1Special<{0,0,?,?}> { - let Inst{6-3} = 0b1111; // A8.6.6 Rm = pc + // A8.6.6 + bits<3> dst; + let Inst{6-3} = 0b1111; // Rm = pc + let Inst{2-0} = dst; } -// PC relative add. +// PC relative add (ADR). def tADDrPCi : T1I<(outs tGPR:$dst), (ins t_imm_s4:$rhs), IIC_iALUi, - "add\t$dst, pc, $rhs", []>, - T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 + "add\t$dst, pc, $rhs", []>, + T1Encoding<{1,0,1,0,0,?}> { + // A6.2 & A8.6.10 + bits<3> dst; + bits<8> rhs; + let Inst{10-8} = dst; + let Inst{7-0} = rhs; +} -// ADD rd, sp, #imm8 +// ADD <Rd>, sp, #<imm8> // This is rematerializable, which is particularly useful for taking the // address of locals. -let isReMaterializable = 1 in { +let isReMaterializable = 1 in def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, t_imm_s4:$rhs), IIC_iALUi, - "add\t$dst, $sp, $rhs", []>, - T1Encoding<{1,0,1,0,1,?}>; // A6.2 & A8.6.8 + "add\t$dst, $sp, $rhs", []>, + T1Encoding<{1,0,1,0,1,?}> { + // A6.2 & A8.6.8 + bits<3> dst; + bits<8> rhs; + let Inst{10-8} = dst; + let Inst{7-0} = rhs; } -// ADD sp, sp, #imm7 +// ADD sp, sp, #<imm7> def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi, "add\t$dst, $rhs", []>, - T1Misc<{0,0,0,0,0,?,?}>; // A6.2.5 & A8.6.8 + T1Misc<{0,0,0,0,0,?,?}> { + // A6.2.5 & A8.6.8 + bits<7> rhs; + let Inst{6-0} = rhs; +} -// SUB sp, sp, #imm7 +// SUB sp, sp, #<imm7> +// FIXME: The encoding and the ASM string don't match up. def tSUBspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi, "sub\t$dst, $rhs", []>, - T1Misc<{0,0,0,0,1,?,?}>; // A6.2.5 & A8.6.215 + T1Misc<{0,0,0,0,1,?,?}> { + // A6.2.5 & A8.6.214 + bits<7> rhs; + let Inst{6-0} = rhs; +} -// ADD rm, sp +// ADD <Rm>, sp def tADDrSP : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, "add\t$dst, $rhs", []>, T1Special<{0,0,?,?}> { - let Inst{6-3} = 0b1101; // A8.6.9 Encoding T1 + // A8.6.9 Encoding T1 + bits<4> dst; + let Inst{7} = dst{3}; + let Inst{6-3} = 0b1101; + let Inst{2-0} = dst{2-0}; } -// ADD sp, rm +// ADD sp, <Rm> def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, "add\t$dst, $rhs", []>, T1Special<{0,0,?,?}> { // A8.6.9 Encoding T2 + bits<4> dst; let Inst{7} = 1; + let Inst{6-3} = dst; let Inst{2-0} = 0b101; } @@ -260,21 +361,37 @@ def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, // let isReturn = 1, isTerminator = 1, isBarrier = 1 in { - def tBX_RET : TI<(outs), (ins), IIC_Br, "bx\tlr", [(ARMretflag)]>, - T1Special<{1,1,0,?}> { // A6.2.3 & A8.6.25 + def tBX_RET : TI<(outs), (ins), IIC_Br, "bx\tlr", + [(ARMretflag)]>, + T1Special<{1,1,0,?}> { + // A6.2.3 & A8.6.25 let Inst{6-3} = 0b1110; // Rm = lr + let Inst{2-0} = 0b000; } + // Alternative return instruction used by vararg functions. - def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), IIC_Br, "bx\t$target",[]>, - T1Special<{1,1,0,?}>; // A6.2.3 & A8.6.25 + def tBX_RET_vararg : TI<(outs), (ins tGPR:$Rm), + IIC_Br, "bx\t$Rm", + []>, + T1Special<{1,1,0,?}> { + // A6.2.3 & A8.6.25 + bits<4> Rm; + let Inst{6-3} = Rm; + let Inst{2-0} = 0b000; + } } // Indirect branches let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { - def tBRIND : TI<(outs), (ins GPR:$dst), IIC_Br, "mov\tpc, $dst", - [(brind GPR:$dst)]>, - T1Special<{1,0,1,?}> { - // <Rd> = Inst{7:2-0} = pc + def tBRIND : TI<(outs), (ins GPR:$Rm), + IIC_Br, + "mov\tpc, $Rm", + [(brind GPR:$Rm)]>, + T1Special<{1,0,?,?}> { + // A8.6.97 + bits<4> Rm; + let Inst{7} = 1; // <Rd> = Inst{7:2-0} = pc + let Inst{6-3} = Rm; let Inst{2-0} = 0b111; } } @@ -282,28 +399,52 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // FIXME: remove when we have a way to marking a MI with these properties. let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, hasExtraDefRegAllocReq = 1 in -def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops), IIC_Br, - "pop${p}\t$dsts", []>, - T1Misc<{1,1,0,?,?,?,?}>; +def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), + IIC_iPop_Br, + "pop${p}\t$regs", []>, + T1Misc<{1,1,0,?,?,?,?}> { + // A8.6.121 + bits<16> regs; + let Inst{8} = regs{15}; // registers = P:'0000000':register_list + let Inst{7-0} = regs{7-0}; +} +// All calls clobber the non-callee saved registers. SP is marked as a use to +// prevent stack-pointer assignments that appear immediately before calls from +// potentially appearing dead. let isCall = 1, + // On non-Darwin platforms R9 is callee-saved. Defs = [R0, R1, R2, R3, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D16, D17, D18, D19, D20, D21, D22, D23, - D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR], + Uses = [SP] in { // Also used for Thumb2 def tBL : TIx2<0b11110, 0b11, 1, - (outs), (ins i32imm:$func, variable_ops), IIC_Br, - "bl\t${func:call}", + (outs), (ins t_bltarget:$func, variable_ops), IIC_Br, + "bl\t$func", [(ARMtcall tglobaladdr:$func)]>, - Requires<[IsThumb, IsNotDarwin]>; + Requires<[IsThumb, IsNotDarwin]> { + bits<21> func; + let Inst{25-16} = func{20-11}; + let Inst{13} = 1; + let Inst{11} = 1; + let Inst{10-0} = func{10-0}; + } // ARMv5T and above, also used for Thumb2 def tBLXi : TIx2<0b11110, 0b11, 0, - (outs), (ins i32imm:$func, variable_ops), IIC_Br, - "blx\t${func:call}", + (outs), (ins t_blxtarget:$func, variable_ops), IIC_Br, + "blx\t$func", [(ARMcall tglobaladdr:$func)]>, - Requires<[IsThumb, HasV5T, IsNotDarwin]>; + Requires<[IsThumb, HasV5T, IsNotDarwin]> { + bits<21> func; + let Inst{25-16} = func{20-11}; + let Inst{13} = 1; + let Inst{11} = 1; + let Inst{10-1} = func{10-1}; + let Inst{0} = 0; // func{0} is assumed zero + } // Also used for Thumb2 def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, @@ -313,642 +454,1002 @@ let isCall = 1, T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24; // ARMv4T + // FIXME: Should be a pseudo. + let isCodeGenOnly = 1 in def tBX : TIx2<{?,?,?,?,?}, {?,?}, ?, (outs), (ins tGPR:$func, variable_ops), IIC_Br, "mov\tlr, pc\n\tbx\t$func", [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsThumb1Only, IsNotDarwin]>; + Requires<[IsThumb, IsThumb1Only, IsNotDarwin]>; } -// On Darwin R9 is call-clobbered. let isCall = 1, + // On Darwin R9 is call-clobbered. + // R7 is marked as a use to prevent frame-pointer assignments from being + // moved above / below calls. Defs = [R0, R1, R2, R3, R9, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D16, D17, D18, D19, D20, D21, D22, D23, - D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR], + Uses = [R7, SP] in { // Also used for Thumb2 def tBLr9 : TIx2<0b11110, 0b11, 1, - (outs), (ins i32imm:$func, variable_ops), IIC_Br, - "bl\t${func:call}", + (outs), (ins pred:$p, t_bltarget:$func, variable_ops), + IIC_Br, "bl${p}\t$func", [(ARMtcall tglobaladdr:$func)]>, - Requires<[IsThumb, IsDarwin]>; + Requires<[IsThumb, IsDarwin]> { + bits<21> func; + let Inst{25-16} = func{20-11}; + let Inst{13} = 1; + let Inst{11} = 1; + let Inst{10-0} = func{10-0}; + } // ARMv5T and above, also used for Thumb2 def tBLXi_r9 : TIx2<0b11110, 0b11, 0, - (outs), (ins i32imm:$func, variable_ops), IIC_Br, - "blx\t${func:call}", + (outs), (ins pred:$p, t_blxtarget:$func, variable_ops), + IIC_Br, "blx${p}\t$func", [(ARMcall tglobaladdr:$func)]>, - Requires<[IsThumb, HasV5T, IsDarwin]>; + Requires<[IsThumb, HasV5T, IsDarwin]> { + bits<21> func; + let Inst{25-16} = func{20-11}; + let Inst{13} = 1; + let Inst{11} = 1; + let Inst{10-1} = func{10-1}; + let Inst{0} = 0; // func{0} is assumed zero + } // Also used for Thumb2 - def tBLXr_r9 : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, - "blx\t$func", + def tBLXr_r9 : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br, + "blx${p}\t$func", [(ARMtcall GPR:$func)]>, Requires<[IsThumb, HasV5T, IsDarwin]>, - T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24 + T1Special<{1,1,1,?}> { + // A6.2.3 & A8.6.24 + bits<4> func; + let Inst{6-3} = func; + let Inst{2-0} = 0b000; + } // ARMv4T + let isCodeGenOnly = 1 in + // FIXME: Should be a pseudo. def tBXr9 : TIx2<{?,?,?,?,?}, {?,?}, ?, (outs), (ins tGPR:$func, variable_ops), IIC_Br, "mov\tlr, pc\n\tbx\t$func", [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsThumb1Only, IsDarwin]>; + Requires<[IsThumb, IsThumb1Only, IsDarwin]>; } -let isBranch = 1, isTerminator = 1 in { - let isBarrier = 1 in { - let isPredicable = 1 in - def tB : T1I<(outs), (ins brtarget:$target), IIC_Br, - "b\t$target", [(br bb:$target)]>, - T1Encoding<{1,1,1,0,0,?}>; +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { + let isPredicable = 1 in + def tB : T1I<(outs), (ins t_brtarget:$target), IIC_Br, + "b\t$target", [(br bb:$target)]>, + T1Encoding<{1,1,1,0,0,?}> { + bits<11> target; + let Inst{10-0} = target; + } // Far jump + // Just a pseudo for a tBL instruction. Needed to let regalloc know about + // the clobber of LR. let Defs = [LR] in - def tBfar : TIx2<0b11110, 0b11, 1, (outs), (ins brtarget:$target), IIC_Br, - "bl\t$target\t${:comment} far jump",[]>; - - def tBR_JTr : T1JTI<(outs), - (ins tGPR:$target, jtblock_operand:$jt, i32imm:$id), - IIC_Br, "mov\tpc, $target\n\t.align\t2$jt", - [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>, - Encoding16 { - let Inst{15-7} = 0b010001101; - let Inst{2-0} = 0b111; - } + def tBfar : tPseudoInst<(outs), (ins t_bltarget:$target), + Size4Bytes, IIC_Br, []>; + + def tBR_JTr : tPseudoInst<(outs), + (ins tGPR:$target, i32imm:$jt, i32imm:$id), + SizeSpecial, IIC_Br, + [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]> { + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; } } // FIXME: should be able to write a pattern for ARMBrcond, but can't use // a two-value operand where a dag node expects two operands. :( let isBranch = 1, isTerminator = 1 in - def tBcc : T1I<(outs), (ins brtarget:$target, pred:$cc), IIC_Br, - "b$cc\t$target", + def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br, + "b${p}\t$target", [/*(ARMbrcond bb:$target, imm:$cc)*/]>, - T1Encoding<{1,1,0,1,?,?}>; + T1Encoding<{1,1,0,1,?,?}> { + bits<4> p; + bits<8> target; + let Inst{11-8} = p; + let Inst{7-0} = target; +} // Compare and branch on zero / non-zero let isBranch = 1, isTerminator = 1 in { - def tCBZ : T1I<(outs), (ins tGPR:$cmp, brtarget:$target), IIC_Br, - "cbz\t$cmp, $target", []>, - T1Misc<{0,0,?,1,?,?,?}>; + def tCBZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br, + "cbz\t$Rn, $target", []>, + T1Misc<{0,0,?,1,?,?,?}> { + // A8.6.27 + bits<6> target; + bits<3> Rn; + let Inst{9} = target{5}; + let Inst{7-3} = target{4-0}; + let Inst{2-0} = Rn; + } - def tCBNZ : T1I<(outs), (ins tGPR:$cmp, brtarget:$target), IIC_Br, + def tCBNZ : T1I<(outs), (ins tGPR:$cmp, t_cbtarget:$target), IIC_Br, "cbnz\t$cmp, $target", []>, - T1Misc<{1,0,?,1,?,?,?}>; + T1Misc<{1,0,?,1,?,?,?}> { + // A8.6.27 + bits<6> target; + bits<3> Rn; + let Inst{9} = target{5}; + let Inst{7-3} = target{4-0}; + let Inst{2-0} = Rn; + } } // A8.6.218 Supervisor Call (Software Interrupt) -- for disassembly only // A8.6.16 B: Encoding T1 // If Inst{11-8} == 0b1111 then SEE SVC -let isCall = 1 in { -def tSVC : T1pI<(outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc", []>, - Encoding16 { +let isCall = 1, Uses = [SP] in +def tSVC : T1pI<(outs), (ins i32imm:$imm), IIC_Br, + "svc", "\t$imm", []>, Encoding16 { + bits<8> imm; let Inst{15-12} = 0b1101; - let Inst{11-8} = 0b1111; -} + let Inst{11-8} = 0b1111; + let Inst{7-0} = imm; } -// A8.6.16 B: Encoding T1 -// If Inst{11-8} == 0b1110 then UNDEFINED -// FIXME: Temporary emitted as raw bytes until this pseudo-op will be added to -// binutils +// The assembler uses 0xDEFE for a trap instruction. let isBarrier = 1, isTerminator = 1 in def tTRAP : TI<(outs), (ins), IIC_Br, - ".short 0xdefe ${:comment} trap", [(trap)]>, Encoding16 { - let Inst{15-12} = 0b1101; - let Inst{11-8} = 0b1110; + "trap", [(trap)]>, Encoding16 { + let Inst = 0xdefe; } //===----------------------------------------------------------------------===// // Load Store Instructions. // +// Loads: reg/reg and reg/imm5 let canFoldAsLoad = 1, isReMaterializable = 1 in -def tLDR : T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, - "ldr", "\t$dst, $addr", - [(set tGPR:$dst, (load t_addrmode_s4:$addr))]>, - T1LdSt<0b100>; -def tLDRi: T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, - "ldr", "\t$dst, $addr", - []>, - T1LdSt4Imm<{1,?,?}>; - -def tLDRB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr, - "ldrb", "\t$dst, $addr", - [(set tGPR:$dst, (zextloadi8 t_addrmode_s1:$addr))]>, - T1LdSt<0b110>; -def tLDRBi: T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr, - "ldrb", "\t$dst, $addr", - []>, - T1LdSt1Imm<{1,?,?}>; - -def tLDRH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr, - "ldrh", "\t$dst, $addr", - [(set tGPR:$dst, (zextloadi16 t_addrmode_s2:$addr))]>, - T1LdSt<0b101>; -def tLDRHi: T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr, - "ldrh", "\t$dst, $addr", - []>, - T1LdSt2Imm<{1,?,?}>; +multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, + Operand AddrMode_r, Operand AddrMode_i, + AddrMode am, InstrItinClass itin_r, + InstrItinClass itin_i, string asm, + PatFrag opnode> { + def r : // reg/reg + T1pILdStEncode<reg_opc, + (outs tGPR:$Rt), (ins AddrMode_r:$addr), + am, itin_r, asm, "\t$Rt, $addr", + [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>; + def i : // reg/imm5 + T1pILdStEncodeImm<imm_opc, 1 /* Load */, + (outs tGPR:$Rt), (ins AddrMode_i:$addr), + am, itin_i, asm, "\t$Rt, $addr", + [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>; +} +// Stores: reg/reg and reg/imm5 +multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, + Operand AddrMode_r, Operand AddrMode_i, + AddrMode am, InstrItinClass itin_r, + InstrItinClass itin_i, string asm, + PatFrag opnode> { + def r : // reg/reg + T1pILdStEncode<reg_opc, + (outs), (ins tGPR:$Rt, AddrMode_r:$addr), + am, itin_r, asm, "\t$Rt, $addr", + [(opnode tGPR:$Rt, AddrMode_r:$addr)]>; + def i : // reg/imm5 + T1pILdStEncodeImm<imm_opc, 0 /* Store */, + (outs), (ins tGPR:$Rt, AddrMode_i:$addr), + am, itin_i, asm, "\t$Rt, $addr", + [(opnode tGPR:$Rt, AddrMode_i:$addr)]>; +} + +// A8.6.57 & A8.6.60 +defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rrs4, + t_addrmode_is4, AddrModeT1_4, + IIC_iLoad_r, IIC_iLoad_i, "ldr", + UnOpFrag<(load node:$Src)>>; + +// A8.6.64 & A8.6.61 +defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rrs1, + t_addrmode_is1, AddrModeT1_1, + IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb", + UnOpFrag<(zextloadi8 node:$Src)>>; + +// A8.6.76 & A8.6.73 +defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2, + t_addrmode_is2, AddrModeT1_2, + IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh", + UnOpFrag<(zextloadi16 node:$Src)>>; let AddedComplexity = 10 in -def tLDRSB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr, +def tLDRSB : // A8.6.80 + T1pILdStEncode<0b011, (outs tGPR:$dst), (ins t_addrmode_rr:$addr), + AddrModeT1_1, IIC_iLoad_bh_r, "ldrsb", "\t$dst, $addr", - [(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>, - T1LdSt<0b011>; + [(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>; let AddedComplexity = 10 in -def tLDRSH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr, +def tLDRSH : // A8.6.84 + T1pILdStEncode<0b111, (outs tGPR:$dst), (ins t_addrmode_rr:$addr), + AddrModeT1_2, IIC_iLoad_bh_r, "ldrsh", "\t$dst, $addr", - [(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>, - T1LdSt<0b111>; + [(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>; let canFoldAsLoad = 1 in -def tLDRspi : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi, - "ldr", "\t$dst, $addr", - [(set tGPR:$dst, (load t_addrmode_sp:$addr))]>, - T1LdStSP<{1,?,?}>; +def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, + T1LdStSP<{1,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} // Special instruction for restore. It cannot clobber condition register // when it's expanded by eliminateCallFramePseudoInstr(). let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1 in -def tRestore : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi, - "ldr", "\t$dst, $addr", []>, - T1LdStSP<{1,?,?}>; +// FIXME: Pseudo for tLDRspi +def tRestore : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoad_i, + "ldr", "\t$dst, $addr", []>, + T1LdStSP<{1,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} // Load tconstpool // FIXME: Use ldr.n to work around a Darwin assembler bug. let canFoldAsLoad = 1, isReMaterializable = 1 in -def tLDRpci : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi, - "ldr", ".n\t$dst, $addr", - [(set tGPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>, - T1Encoding<{0,1,0,0,1,?}>; // A6.2 & A8.6.59 - -// Special LDR for loads from non-pc-relative constpools. -let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1, - isReMaterializable = 1 in -def tLDRcp : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi, - "ldr", "\t$dst, $addr", []>, - T1LdStSP<{1,?,?}>; - -def tSTR : T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer, - "str", "\t$src, $addr", - [(store tGPR:$src, t_addrmode_s4:$addr)]>, - T1LdSt<0b000>; -def tSTRi: T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer, - "str", "\t$src, $addr", - []>, - T1LdSt4Imm<{0,?,?}>; - -def tSTRB : T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer, - "strb", "\t$src, $addr", - [(truncstorei8 tGPR:$src, t_addrmode_s1:$addr)]>, - T1LdSt<0b010>; -def tSTRBi: T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer, - "strb", "\t$src, $addr", - []>, - T1LdSt1Imm<{0,?,?}>; - -def tSTRH : T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer, - "strh", "\t$src, $addr", - [(truncstorei16 tGPR:$src, t_addrmode_s2:$addr)]>, - T1LdSt<0b001>; -def tSTRHi: T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer, - "strh", "\t$src, $addr", - []>, - T1LdSt2Imm<{0,?,?}>; - -def tSTRspi : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei, - "str", "\t$src, $addr", - [(store tGPR:$src, t_addrmode_sp:$addr)]>, - T1LdStSP<{0,?,?}>; - -let mayStore = 1, neverHasSideEffects = 1 in { -// Special instruction for spill. It cannot clobber condition register -// when it's expanded by eliminateCallFramePseudoInstr(). -def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei, +def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, + "ldr", ".n\t$Rt, $addr", + [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, + T1Encoding<{0,1,0,0,1,?}> { + // A6.2 & A8.6.59 + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +// A8.6.194 & A8.6.192 +defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4, + t_addrmode_is4, AddrModeT1_4, + IIC_iStore_r, IIC_iStore_i, "str", + BinOpFrag<(store node:$LHS, node:$RHS)>>; + +// A8.6.197 & A8.6.195 +defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1, + t_addrmode_is1, AddrModeT1_1, + IIC_iStore_bh_r, IIC_iStore_bh_i, "strb", + BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; + +// A8.6.207 & A8.6.205 +defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2, + t_addrmode_is2, AddrModeT1_2, + IIC_iStore_bh_r, IIC_iStore_bh_i, "strh", + BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; + + +def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, + "str", "\t$Rt, $addr", + [(store tGPR:$Rt, t_addrmode_sp:$addr)]>, + T1LdStSP<{0,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +let mayStore = 1, neverHasSideEffects = 1 in +// Special instruction for spill. It cannot clobber condition register when it's +// expanded by eliminateCallFramePseudoInstr(). +// FIXME: Pseudo for tSTRspi +def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStore_i, "str", "\t$src, $addr", []>, - T1LdStSP<{0,?,?}>; + T1LdStSP<{0,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; } //===----------------------------------------------------------------------===// // Load / store multiple Instructions. // -// These requires base address to be written back or one of the loaded regs. -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { -def tLDM : T1I<(outs), - (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), - IIC_iLoadm, - "ldm${addr:submode}${p}\t$addr, $dsts", []>, - T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53 - -def tLDM_UPD : T1It<(outs tGPR:$wb), - (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), - IIC_iLoadm, - "ldm${addr:submode}${p}\t$addr!, $dsts", - "$addr.addr = $wb", []>, - T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53 -} // mayLoad, neverHasSideEffects = 1, hasExtraDefRegAllocReq - -let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in -def tSTM_UPD : T1It<(outs tGPR:$wb), - (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops), - IIC_iStorem, - "stm${addr:submode}${p}\t$addr!, $srcs", - "$addr.addr = $wb", []>, - T1Encoding<{1,1,0,0,0,?}>; // A6.2 & A8.6.189 +multiclass thumb_ldst_mult<string asm, InstrItinClass itin, + InstrItinClass itin_upd, bits<6> T1Enc, + bit L_bit> { + def IA : + T1I<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "ia${p}\t$Rn, $regs"), []>, + T1Encoding<T1Enc> { + bits<3> Rn; + bits<8> regs; + let Inst{10-8} = Rn; + let Inst{7-0} = regs; + } + def IA_UPD : + T1It<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []>, + T1Encoding<T1Enc> { + bits<3> Rn; + bits<8> regs; + let Inst{10-8} = Rn; + let Inst{7-0} = regs; + } +} + +// These require base address to be written back or one of the loaded regs. +let neverHasSideEffects = 1 in { + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm tLDM : thumb_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, + {1,1,0,0,1,?}, 1>; + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm tSTM : thumb_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu, + {1,1,0,0,0,?}, 0>; + +} // neverHasSideEffects let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in -def tPOP : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops), IIC_Br, - "pop${p}\t$dsts", []>, - T1Misc<{1,1,0,?,?,?,?}>; +def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), + IIC_iPop, + "pop${p}\t$regs", []>, + T1Misc<{1,1,0,?,?,?,?}> { + bits<16> regs; + let Inst{8} = regs{15}; + let Inst{7-0} = regs{7-0}; +} let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in -def tPUSH : T1I<(outs), (ins pred:$p, reglist:$srcs, variable_ops), IIC_Br, - "push${p}\t$srcs", []>, - T1Misc<{0,1,0,?,?,?,?}>; +def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), + IIC_iStore_m, + "push${p}\t$regs", []>, + T1Misc<{0,1,0,?,?,?,?}> { + bits<16> regs; + let Inst{8} = regs{14}; + let Inst{7-0} = regs{7-0}; +} //===----------------------------------------------------------------------===// // Arithmetic Instructions. // +// Helper classes for encoding T1pI patterns: +class T1pIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1pI<oops, iops, itin, opc, asm, pattern>, + T1DataProcessing<opA> { + bits<3> Rm; + bits<3> Rn; + let Inst{5-3} = Rm; + let Inst{2-0} = Rn; +} +class T1pIMiscEncode<bits<7> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1pI<oops, iops, itin, opc, asm, pattern>, + T1Misc<opA> { + bits<3> Rm; + bits<3> Rd; + let Inst{5-3} = Rm; + let Inst{2-0} = Rd; +} + +// Helper classes for encoding T1sI patterns: +class T1sIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sI<oops, iops, itin, opc, asm, pattern>, + T1DataProcessing<opA> { + bits<3> Rd; + bits<3> Rn; + let Inst{5-3} = Rn; + let Inst{2-0} = Rd; +} +class T1sIGenEncode<bits<5> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sI<oops, iops, itin, opc, asm, pattern>, + T1General<opA> { + bits<3> Rm; + bits<3> Rn; + bits<3> Rd; + let Inst{8-6} = Rm; + let Inst{5-3} = Rn; + let Inst{2-0} = Rd; +} +class T1sIGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sI<oops, iops, itin, opc, asm, pattern>, + T1General<opA> { + bits<3> Rd; + bits<3> Rm; + let Inst{5-3} = Rm; + let Inst{2-0} = Rd; +} + +// Helper classes for encoding T1sIt patterns: +class T1sItDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sIt<oops, iops, itin, opc, asm, pattern>, + T1DataProcessing<opA> { + bits<3> Rdn; + bits<3> Rm; + let Inst{5-3} = Rm; + let Inst{2-0} = Rdn; +} +class T1sItGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sIt<oops, iops, itin, opc, asm, pattern>, + T1General<opA> { + bits<3> Rdn; + bits<8> imm8; + let Inst{10-8} = Rdn; + let Inst{7-0} = imm8; +} + // Add with carry register let isCommutable = 1, Uses = [CPSR] in -def tADC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, - "adc", "\t$dst, $rhs", - [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>, - T1DataProcessing<0b0101>; +def tADC : // A8.6.2 + T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr, + "adc", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>; // Add immediate -def tADDi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, - "add", "\t$dst, $lhs, $rhs", - [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>, - T1General<0b01110>; +def tADDi3 : // A8.6.4 T1 + T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3), IIC_iALUi, + "add", "\t$Rd, $Rm, $imm3", + [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]> { + bits<3> imm3; + let Inst{8-6} = imm3; +} -def tADDi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, - "add", "\t$dst, $rhs", - [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>, - T1General<{1,1,0,?,?}>; +def tADDi8 : // A8.6.4 T2 + T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn), (ins tGPR:$Rn, i32imm:$imm8), + IIC_iALUi, + "add", "\t$Rdn, $imm8", + [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>; // Add register let isCommutable = 1 in -def tADDrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, - "add", "\t$dst, $lhs, $rhs", - [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>, - T1General<0b01100>; +def tADDrr : // A8.6.6 T1 + T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iALUr, + "add", "\t$Rd, $Rn, $Rm", + [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>; let neverHasSideEffects = 1 in -def tADDhirr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, - "add", "\t$dst, $rhs", []>, - T1Special<{0,0,?,?}>; +def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr, + "add", "\t$Rdn, $Rm", []>, + T1Special<{0,0,?,?}> { + // A8.6.6 T2 + bits<4> Rdn; + bits<4> Rm; + let Inst{7} = Rdn{3}; + let Inst{6-3} = Rm; + let Inst{2-0} = Rdn{2-0}; +} -// And register +// AND register let isCommutable = 1 in -def tAND : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, - "and", "\t$dst, $rhs", - [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>, - T1DataProcessing<0b0000>; +def tAND : // A8.6.12 + T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "and", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>; // ASR immediate -def tASRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi, - "asr", "\t$dst, $lhs, $rhs", - [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>, - T1General<{0,1,0,?,?}>; +def tASRri : // A8.6.14 + T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5), + IIC_iMOVsi, + "asr", "\t$Rd, $Rm, $imm5", + [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm:$imm5)))]> { + bits<5> imm5; + let Inst{10-6} = imm5; +} // ASR register -def tASRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, - "asr", "\t$dst, $rhs", - [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>, - T1DataProcessing<0b0100>; +def tASRrr : // A8.6.15 + T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "asr", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>; // BIC register -def tBIC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, - "bic", "\t$dst, $rhs", - [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>, - T1DataProcessing<0b1110>; +def tBIC : // A8.6.20 + T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "bic", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>; // CMN register -let Defs = [CPSR] in { +let isCompare = 1, Defs = [CPSR] in { //FIXME: Disable CMN, as CCodes are backwards from compare expectations // Compare-to-zero still works out, just not the relationals -//def tCMN : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, -// "cmn", "\t$lhs, $rhs", -// [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>, -// T1DataProcessing<0b1011>; -def tCMNz : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, - "cmn", "\t$lhs, $rhs", - [(ARMcmpZ tGPR:$lhs, (ineg tGPR:$rhs))]>, - T1DataProcessing<0b1011>; -} +//def tCMN : // A8.6.33 +// T1pIDPEncode<0b1011, (outs), (ins tGPR:$lhs, tGPR:$rhs), +// IIC_iCMPr, +// "cmn", "\t$lhs, $rhs", +// [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>; + +def tCMNz : // A8.6.33 + T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iCMPr, + "cmn", "\t$Rn, $Rm", + [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>; + +} // isCompare = 1, Defs = [CPSR] // CMP immediate -let Defs = [CPSR] in { -def tCMPi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi, - "cmp", "\t$lhs, $rhs", - [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>, - T1General<{1,0,1,?,?}>; -def tCMPzi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi, - "cmp", "\t$lhs, $rhs", - [(ARMcmpZ tGPR:$lhs, imm0_255:$rhs)]>, - T1General<{1,0,1,?,?}>; +let isCompare = 1, Defs = [CPSR] in { +def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, i32imm:$imm8), IIC_iCMPi, + "cmp", "\t$Rn, $imm8", + [(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>, + T1General<{1,0,1,?,?}> { + // A8.6.35 + bits<3> Rn; + bits<8> imm8; + let Inst{10-8} = Rn; + let Inst{7-0} = imm8; } // CMP register -let Defs = [CPSR] in { -def tCMPr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, - "cmp", "\t$lhs, $rhs", - [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>, - T1DataProcessing<0b1010>; -def tCMPzr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, - "cmp", "\t$lhs, $rhs", - [(ARMcmpZ tGPR:$lhs, tGPR:$rhs)]>, - T1DataProcessing<0b1010>; - -def tCMPhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr, - "cmp", "\t$lhs, $rhs", []>, - T1Special<{0,1,?,?}>; -def tCMPzhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr, - "cmp", "\t$lhs, $rhs", []>, - T1Special<{0,1,?,?}>; +def tCMPr : // A8.6.36 T1 + T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iCMPr, + "cmp", "\t$Rn, $Rm", + [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>; + +def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr, + "cmp", "\t$Rn, $Rm", []>, + T1Special<{0,1,?,?}> { + // A8.6.36 T2 + bits<4> Rm; + bits<4> Rn; + let Inst{7} = Rn{3}; + let Inst{6-3} = Rm; + let Inst{2-0} = Rn{2-0}; } +} // isCompare = 1, Defs = [CPSR] // XOR register let isCommutable = 1 in -def tEOR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, - "eor", "\t$dst, $rhs", - [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>, - T1DataProcessing<0b0001>; +def tEOR : // A8.6.45 + T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "eor", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>; // LSL immediate -def tLSLri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi, - "lsl", "\t$dst, $lhs, $rhs", - [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>, - T1General<{0,0,0,?,?}>; +def tLSLri : // A8.6.88 + T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5), + IIC_iMOVsi, + "lsl", "\t$Rd, $Rm, $imm5", + [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]> { + bits<5> imm5; + let Inst{10-6} = imm5; +} // LSL register -def tLSLrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, - "lsl", "\t$dst, $rhs", - [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>, - T1DataProcessing<0b0010>; +def tLSLrr : // A8.6.89 + T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "lsl", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>; // LSR immediate -def tLSRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi, - "lsr", "\t$dst, $lhs, $rhs", - [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>, - T1General<{0,0,1,?,?}>; +def tLSRri : // A8.6.90 + T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5), + IIC_iMOVsi, + "lsr", "\t$Rd, $Rm, $imm5", + [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm:$imm5)))]> { + bits<5> imm5; + let Inst{10-6} = imm5; +} // LSR register -def tLSRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, - "lsr", "\t$dst, $rhs", - [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>, - T1DataProcessing<0b0011>; - -// move register -def tMOVi8 : T1sI<(outs tGPR:$dst), (ins i32imm:$src), IIC_iMOVi, - "mov", "\t$dst, $src", - [(set tGPR:$dst, imm0_255:$src)]>, - T1General<{1,0,0,?,?}>; +def tLSRrr : // A8.6.91 + T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "lsr", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>; + +// Move register +let isMoveImm = 1 in +def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins i32imm:$imm8), IIC_iMOVi, + "mov", "\t$Rd, $imm8", + [(set tGPR:$Rd, imm0_255:$imm8)]>, + T1General<{1,0,0,?,?}> { + // A8.6.96 + bits<3> Rd; + bits<8> imm8; + let Inst{10-8} = Rd; + let Inst{7-0} = imm8; +} // TODO: A7-73: MOV(2) - mov setting flag. - let neverHasSideEffects = 1 in { // FIXME: Make this predicable. -def tMOVr : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr, - "mov\t$dst, $src", []>, - T1Special<0b1000>; +def tMOVr : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr, + "mov\t$Rd, $Rm", []>, + T1Special<0b1000> { + // A8.6.97 + bits<4> Rd; + bits<4> Rm; + // Bits {7-6} are encoded by the T1Special value. + let Inst{5-3} = Rm{2-0}; + let Inst{2-0} = Rd{2-0}; +} let Defs = [CPSR] in -def tMOVSr : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr, - "movs\t$dst, $src", []>, Encoding16 { +def tMOVSr : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr, + "movs\t$Rd, $Rm", []>, Encoding16 { + // A8.6.97 + bits<3> Rd; + bits<3> Rm; let Inst{15-6} = 0b0000000000; + let Inst{5-3} = Rm; + let Inst{2-0} = Rd; } // FIXME: Make these predicable. -def tMOVgpr2tgpr : T1I<(outs tGPR:$dst), (ins GPR:$src), IIC_iMOVr, - "mov\t$dst, $src", []>, - T1Special<{1,0,0,?}>; -def tMOVtgpr2gpr : T1I<(outs GPR:$dst), (ins tGPR:$src), IIC_iMOVr, - "mov\t$dst, $src", []>, - T1Special<{1,0,?,0}>; -def tMOVgpr2gpr : T1I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, - "mov\t$dst, $src", []>, - T1Special<{1,0,?,?}>; +def tMOVgpr2tgpr : T1I<(outs tGPR:$Rd), (ins GPR:$Rm), IIC_iMOVr, + "mov\t$Rd, $Rm", []>, + T1Special<{1,0,0,?}> { + // A8.6.97 + bits<4> Rd; + bits<4> Rm; + // Bit {7} is encoded by the T1Special value. + let Inst{6-3} = Rm; + let Inst{2-0} = Rd{2-0}; +} +def tMOVtgpr2gpr : T1I<(outs GPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr, + "mov\t$Rd, $Rm", []>, + T1Special<{1,0,?,0}> { + // A8.6.97 + bits<4> Rd; + bits<4> Rm; + // Bit {6} is encoded by the T1Special value. + let Inst{7} = Rd{3}; + let Inst{5-3} = Rm{2-0}; + let Inst{2-0} = Rd{2-0}; +} +def tMOVgpr2gpr : T1I<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVr, + "mov\t$Rd, $Rm", []>, + T1Special<{1,0,?,?}> { + // A8.6.97 + bits<4> Rd; + bits<4> Rm; + let Inst{7} = Rd{3}; + let Inst{6-3} = Rm; + let Inst{2-0} = Rd{2-0}; +} } // neverHasSideEffects -// multiply register +// Multiply register let isCommutable = 1 in -def tMUL : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMUL32, - "mul", "\t$dst, $rhs, $dst", /* A8.6.105 MUL Encoding T1 */ - [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>, - T1DataProcessing<0b1101>; - -// move inverse register -def tMVN : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr, - "mvn", "\t$dst, $src", - [(set tGPR:$dst, (not tGPR:$src))]>, - T1DataProcessing<0b1111>; - -// bitwise or register +def tMUL : // A8.6.105 T1 + T1sItDPEncode<0b1101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMUL32, + "mul", "\t$Rdn, $Rm, $Rdn", + [(set tGPR:$Rdn, (mul tGPR:$Rn, tGPR:$Rm))]>; + +// Move inverse register +def tMVN : // A8.6.107 + T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr, + "mvn", "\t$Rd, $Rn", + [(set tGPR:$Rd, (not tGPR:$Rn))]>; + +// Bitwise or register let isCommutable = 1 in -def tORR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, - "orr", "\t$dst, $rhs", - [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>, - T1DataProcessing<0b1100>; - -// swaps -def tREV : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, - "rev", "\t$dst, $src", - [(set tGPR:$dst, (bswap tGPR:$src))]>, - Requires<[IsThumb1Only, HasV6]>, - T1Misc<{1,0,1,0,0,0,?}>; - -def tREV16 : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, - "rev16", "\t$dst, $src", - [(set tGPR:$dst, - (or (and (srl tGPR:$src, (i32 8)), 0xFF), - (or (and (shl tGPR:$src, (i32 8)), 0xFF00), - (or (and (srl tGPR:$src, (i32 8)), 0xFF0000), - (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>, - Requires<[IsThumb1Only, HasV6]>, - T1Misc<{1,0,1,0,0,1,?}>; - -def tREVSH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, - "revsh", "\t$dst, $src", - [(set tGPR:$dst, - (sext_inreg - (or (srl (and tGPR:$src, 0xFF00), (i32 8)), - (shl tGPR:$src, (i32 8))), i16))]>, - Requires<[IsThumb1Only, HasV6]>, - T1Misc<{1,0,1,0,1,1,?}>; - -// rotate right register -def tROR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, - "ror", "\t$dst, $rhs", - [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>, - T1DataProcessing<0b0111>; - -// negate register -def tRSB : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iALUi, - "rsb", "\t$dst, $src, #0", - [(set tGPR:$dst, (ineg tGPR:$src))]>, - T1DataProcessing<0b1001>; +def tORR : // A8.6.114 + T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "orr", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>; + +// Swaps +def tREV : // A8.6.134 + T1pIMiscEncode<{1,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "rev", "\t$Rd, $Rm", + [(set tGPR:$Rd, (bswap tGPR:$Rm))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +def tREV16 : // A8.6.135 + T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "rev16", "\t$Rd, $Rm", + [(set tGPR:$Rd, + (or (and (srl tGPR:$Rm, (i32 8)), 0xFF), + (or (and (shl tGPR:$Rm, (i32 8)), 0xFF00), + (or (and (srl tGPR:$Rm, (i32 8)), 0xFF0000), + (and (shl tGPR:$Rm, (i32 8)), 0xFF000000)))))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +def tREVSH : // A8.6.136 + T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "revsh", "\t$Rd, $Rm", + [(set tGPR:$Rd, + (sext_inreg + (or (srl (and tGPR:$Rm, 0xFF00), (i32 8)), + (shl tGPR:$Rm, (i32 8))), i16))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +// Rotate right register +def tROR : // A8.6.139 + T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "ror", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>; + +// Negate register +def tRSB : // A8.6.141 + T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn), + IIC_iALUi, + "rsb", "\t$Rd, $Rn, #0", + [(set tGPR:$Rd, (ineg tGPR:$Rn))]>; // Subtract with carry register let Uses = [CPSR] in -def tSBC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, - "sbc", "\t$dst, $rhs", - [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>, - T1DataProcessing<0b0110>; +def tSBC : // A8.6.151 + T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iALUr, + "sbc", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>; // Subtract immediate -def tSUBi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, - "sub", "\t$dst, $lhs, $rhs", - [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>, - T1General<0b01111>; - -def tSUBi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, - "sub", "\t$dst, $rhs", - [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>, - T1General<{1,1,1,?,?}>; - -// subtract register -def tSUBrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, - "sub", "\t$dst, $lhs, $rhs", - [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>, - T1General<0b01101>; +def tSUBi3 : // A8.6.210 T1 + T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3), + IIC_iALUi, + "sub", "\t$Rd, $Rm, $imm3", + [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]> { + bits<3> imm3; + let Inst{8-6} = imm3; +} -// TODO: A7-96: STMIA - store multiple. +def tSUBi8 : // A8.6.210 T2 + T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn), (ins tGPR:$Rn, i32imm:$imm8), + IIC_iALUi, + "sub", "\t$Rdn, $imm8", + [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>; + +// Subtract register +def tSUBrr : // A8.6.212 + T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iALUr, + "sub", "\t$Rd, $Rn, $Rm", + [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>; -// sign-extend byte -def tSXTB : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, - "sxtb", "\t$dst, $src", - [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>, - Requires<[IsThumb1Only, HasV6]>, - T1Misc<{0,0,1,0,0,1,?}>; - -// sign-extend short -def tSXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, - "sxth", "\t$dst, $src", - [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>, - Requires<[IsThumb1Only, HasV6]>, - T1Misc<{0,0,1,0,0,0,?}>; - -// test -let isCommutable = 1, Defs = [CPSR] in -def tTST : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, - "tst", "\t$lhs, $rhs", - [(ARMcmpZ (and tGPR:$lhs, tGPR:$rhs), 0)]>, - T1DataProcessing<0b1000>; - -// zero-extend byte -def tUXTB : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, - "uxtb", "\t$dst, $src", - [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>, - Requires<[IsThumb1Only, HasV6]>, - T1Misc<{0,0,1,0,1,1,?}>; - -// zero-extend short -def tUXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, - "uxth", "\t$dst, $src", - [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>, - Requires<[IsThumb1Only, HasV6]>, - T1Misc<{0,0,1,0,1,0,?}>; +// TODO: A7-96: STMIA - store multiple. +// Sign-extend byte +def tSXTB : // A8.6.222 + T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "sxtb", "\t$Rd, $Rm", + [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +// Sign-extend short +def tSXTH : // A8.6.224 + T1pIMiscEncode<{0,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "sxth", "\t$Rd, $Rm", + [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +// Test +let isCompare = 1, isCommutable = 1, Defs = [CPSR] in +def tTST : // A8.6.230 + T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr, + "tst", "\t$Rn, $Rm", + [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>; + +// Zero-extend byte +def tUXTB : // A8.6.262 + T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "uxtb", "\t$Rd, $Rm", + [(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +// Zero-extend short +def tUXTH : // A8.6.264 + T1pIMiscEncode<{0,0,1,0,1,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "uxth", "\t$Rd, $Rm", + [(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; // Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation. // Expanded after instruction selection into a branch sequence. let usesCustomInserter = 1 in // Expanded after instruction selection. def tMOVCCr_pseudo : PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc), - NoItinerary, "${:comment} tMOVCCr $cc", + NoItinerary, [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>; // 16-bit movcc in IT blocks for Thumb2. let neverHasSideEffects = 1 in { -def tMOVCCr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iCMOVr, - "mov", "\t$dst, $rhs", []>, - T1Special<{1,0,?,?}>; +def tMOVCCr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iCMOVr, + "mov", "\t$Rdn, $Rm", []>, + T1Special<{1,0,?,?}> { + bits<4> Rdn; + bits<4> Rm; + let Inst{7} = Rdn{3}; + let Inst{6-3} = Rm; + let Inst{2-0} = Rdn{2-0}; +} + +let isMoveImm = 1 in +def tMOVCCi : T1pIt<(outs tGPR:$Rdn), (ins tGPR:$Rn, i32imm:$Rm), IIC_iCMOVi, + "mov", "\t$Rdn, $Rm", []>, + T1General<{1,0,0,?,?}> { + bits<3> Rdn; + bits<8> Rm; + let Inst{10-8} = Rdn; + let Inst{7-0} = Rm; +} -def tMOVCCi : T1pIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMOVi, - "mov", "\t$dst, $rhs", []>, - T1General<{1,0,0,?,?}>; } // neverHasSideEffects // tLEApcrel - Load a pc-relative address into a register without offending the // assembler. -let neverHasSideEffects = 1 in { -let isReMaterializable = 1 in -def tLEApcrel : T1I<(outs tGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, - "adr$p\t$dst, #$label", []>, - T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 -} // neverHasSideEffects -def tLEApcrelJT : T1I<(outs tGPR:$dst), - (ins i32imm:$label, nohash_imm:$id, pred:$p), - IIC_iALUi, "adr$p\t$dst, #${label}_${id}", []>, - T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 +def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p), + IIC_iALUi, "adr{$p}\t$Rd, #$addr", []>, + T1Encoding<{1,0,1,0,0,?}> { + bits<3> Rd; + bits<8> addr; + let Inst{10-8} = Rd; + let Inst{7-0} = addr; +} + +let neverHasSideEffects = 1, isReMaterializable = 1 in +def tLEApcrel : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p), + Size2Bytes, IIC_iALUi, []>; + +def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), + (ins i32imm:$label, nohash_imm:$id, pred:$p), + Size2Bytes, IIC_iALUi, []>; + +//===----------------------------------------------------------------------===// +// Move between coprocessor and ARM core register -- for disassembly only +// + +class tMovRCopro<string opc, bit direction> + : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, + GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), + !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), + [/* For disassembly only; pattern left blank */]> { + let Inst{27-24} = 0b1110; + let Inst{20} = direction; + let Inst{4} = 1; + + bits<4> Rt; + bits<4> cop; + bits<3> opc1; + bits<3> opc2; + bits<4> CRm; + bits<4> CRn; + + let Inst{15-12} = Rt; + let Inst{11-8} = cop; + let Inst{23-21} = opc1; + let Inst{7-5} = opc2; + let Inst{3-0} = CRm; + let Inst{19-16} = CRn; +} + +def tMCR : tMovRCopro<"mcr", 0 /* from ARM core register to coprocessor */>; +def tMRC : tMovRCopro<"mrc", 1 /* from coprocessor to ARM core register */>; + +class tMovRRCopro<string opc, bit direction> + : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), + !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), + [/* For disassembly only; pattern left blank */]> { + let Inst{27-24} = 0b1100; + let Inst{23-21} = 0b010; + let Inst{20} = direction; + + bits<4> Rt; + bits<4> Rt2; + bits<4> cop; + bits<4> opc1; + bits<4> CRm; + + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + let Inst{11-8} = cop; + let Inst{7-4} = opc1; + let Inst{3-0} = CRm; +} + +def tMCRR : tMovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */>; +def tMRRC : tMovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>; + +//===----------------------------------------------------------------------===// +// Other Coprocessor Instructions. For disassembly only. +// +def tCDP : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), + "cdp\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-24} = 0b1110; + + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; +} //===----------------------------------------------------------------------===// // TLS Instructions // // __aeabi_read_tp preserves the registers r1-r3. -let isCall = 1, - Defs = [R0, LR] in { - def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br, - "bl\t__aeabi_read_tp", - [(set R0, ARMthread_pointer)]>; +let isCall = 1, Defs = [R0, LR], Uses = [SP] in +def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br, + "bl\t__aeabi_read_tp", + [(set R0, ARMthread_pointer)]> { + // Encoding is 0xf7fffffe. + let Inst = 0xf7fffffe; } +//===----------------------------------------------------------------------===// // SJLJ Exception handling intrinsics -// eh_sjlj_setjmp() is an instruction sequence to store the return -// address and save #0 in R0 for the non-longjmp case. -// Since by its nature we may be coming from some other function to get -// here, and we're using the stack frame for the containing function to -// save/restore registers, we can't keep anything live in regs across -// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon -// when we get here from a longjmp(). We force everthing out of registers -// except for our own input by listing the relevant registers in Defs. By -// doing so, we also cause the prologue/epilogue code to actively preserve -// all of the callee-saved resgisters, which is exactly what we want. -// $val is a scratch register for our use. -let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R12 ], hasSideEffects = 1, - isBarrier = 1 in { - def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val), - AddrModeNone, SizeSpecial, NoItinerary, - "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" - "adds\t$val, #7\n\t" - "str\t$val, [$src, #4]\n\t" - "movs\tr0, #0\n\t" - "b\t1f\n\t" - "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" - "1:", "", - [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>; -} +// + +// eh_sjlj_setjmp() is an instruction sequence to store the return address and +// save #0 in R0 for the non-longjmp case. Since by its nature we may be coming +// from some other function to get here, and we're using the stack frame for the +// containing function to save/restore registers, we can't keep anything live in +// regs across the eh_sjlj_setjmp(), else it will almost certainly have been +// tromped upon when we get here from a longjmp(). We force everthing out of +// registers except for our own input by listing the relevant registers in +// Defs. By doing so, we also cause the prologue/epilogue code to actively +// preserve all of the callee-saved resgisters, which is exactly what we want. +// $val is a scratch register for our use. +let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12 ], + hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in +def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val), + AddrModeNone, SizeSpecial, NoItinerary, "","", + [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>; // FIXME: Non-Darwin version(s) -let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, - Defs = [ R7, LR, SP ] in { +let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1, + Defs = [ R7, LR, SP ] in def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch), - AddrModeNone, SizeSpecial, IndexModeNone, - Pseudo, NoItinerary, - "ldr\t$scratch, [$src, #8]\n\t" - "mov\tsp, $scratch\n\t" - "ldr\t$scratch, [$src, #4]\n\t" - "ldr\tr7, [$src]\n\t" - "bx\t$scratch", "", - [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, - Requires<[IsThumb, IsDarwin]>; -} + AddrModeNone, SizeSpecial, IndexModeNone, + Pseudo, NoItinerary, "", "", + [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, + Requires<[IsThumb, IsDarwin]>; //===----------------------------------------------------------------------===// // Non-Instruction Patterns // +// Comparisons +def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8), + (tCMPi8 tGPR:$Rn, imm0_255:$imm8)>; +def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm), + (tCMPr tGPR:$Rn, tGPR:$Rm)>; + // Add with carry def : T1Pat<(addc tGPR:$lhs, imm0_7:$rhs), (tADDi3 tGPR:$lhs, imm0_7:$rhs)>; @@ -991,27 +1492,42 @@ def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr_r9 GPR:$dst)>, Requires<[IsThumb, HasV5T, IsDarwin]>; // zextload i1 -> zextload i8 -def : T1Pat<(zextloadi1 t_addrmode_s1:$addr), - (tLDRB t_addrmode_s1:$addr)>; +def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr), + (tLDRBr t_addrmode_rrs1:$addr)>; +def : T1Pat<(zextloadi1 t_addrmode_is1:$addr), + (tLDRBi t_addrmode_is1:$addr)>; // extload -> zextload -def : T1Pat<(extloadi1 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>; -def : T1Pat<(extloadi8 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>; -def : T1Pat<(extloadi16 t_addrmode_s2:$addr), (tLDRH t_addrmode_s2:$addr)>; +def : T1Pat<(extloadi1 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>; +def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_rrs2:$addr), (tLDRHr t_addrmode_rrs2:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>; // If it's impossible to use [r,r] address mode for sextload, select to // ldr{b|h} + sxt{b|h} instead. -def : T1Pat<(sextloadi8 t_addrmode_s1:$addr), - (tSXTB (tLDRB t_addrmode_s1:$addr))>, - Requires<[IsThumb1Only, HasV6]>; -def : T1Pat<(sextloadi16 t_addrmode_s2:$addr), - (tSXTH (tLDRH t_addrmode_s2:$addr))>, - Requires<[IsThumb1Only, HasV6]>; - -def : T1Pat<(sextloadi8 t_addrmode_s1:$addr), - (tASRri (tLSLri (tLDRB t_addrmode_s1:$addr), 24), 24)>; -def : T1Pat<(sextloadi16 t_addrmode_s1:$addr), - (tASRri (tLSLri (tLDRH t_addrmode_s1:$addr), 16), 16)>; +def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), + (tSXTB (tLDRBi t_addrmode_is1:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr), + (tSXTB (tLDRBr t_addrmode_rrs1:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), + (tSXTH (tLDRHi t_addrmode_is2:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr), + (tSXTH (tLDRHr t_addrmode_rrs2:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr), + (tASRri (tLSLri (tLDRBr t_addrmode_rrs1:$addr), 24), 24)>; +def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), + (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>; +def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr), + (tASRri (tLSLri (tLDRHr t_addrmode_rrs2:$addr), 16), 16)>; +def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), + (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>; // Large immediate handling. @@ -1028,8 +1544,7 @@ def : T1Pat<(i32 imm0_255_comp:$src), // scheduling. let isReMaterializable = 1 in def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), - NoItinerary, - "${:comment} ldr.n\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", + NoItinerary, [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), imm:$cp))]>, - Requires<[IsThumb1Only]>; + Requires<[IsThumb, IsThumb1Only]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td index 6ba0a44..0e01be5 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -21,16 +21,12 @@ def it_mask : Operand<i32> { let PrintMethod = "printThumbITMask"; } -// Table branch address -def tb_addrmode : Operand<i32> { - let PrintMethod = "printTBAddrMode"; -} - // Shifted operands. No register controlled shifts for Thumb2. // Note: We do not support rrx shifted operands yet. def t2_so_reg : Operand<i32>, // reg imm ComplexPattern<i32, 2, "SelectT2ShifterOperandReg", [shl,srl,sra,rotr]> { + let EncoderMethod = "getT2SORegOpValue"; let PrintMethod = "printT2SOOperand"; let MIOperandInfo = (ops rGPR, i32imm); } @@ -47,11 +43,10 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ // t2_so_imm - Match a 32-bit immediate operand, which is an // 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit -// immediate splatted into multiple bytes of the word. t2_so_imm values are -// represented in the imm field in the same 12-bit form that they are encoded -// into t2_so_imm instructions: the 8-bit immediate is the least significant -// bits [bits 0-7], the 4-bit shift/splat amount is the next 4 bits [bits 8-11]. -def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_t2_so_imm(N); }]>; +// immediate splatted into multiple bytes of the word. +def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_t2_so_imm(N); }]> { + let EncoderMethod = "getT2SOImmOpValue"; +} // t2_so_imm_not - Match an immediate that is a complement // of a t2_so_imm. @@ -63,7 +58,7 @@ def t2_so_imm_not : Operand<i32>, // t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm. def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{ - return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1; + return ARM_AM::getT2SOImmVal(-((uint32_t)N->getZExtValue())) != -1; }], t2_so_imm_neg_XFORM>; // Break t2_so_imm's up into two pieces. This handles immediates with up to 16 @@ -128,27 +123,41 @@ def imm0_255_not : PatLeaf<(i32 imm), [{ // t2addrmode_imm12 := reg + imm12 def t2addrmode_imm12 : Operand<i32>, ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> { - let PrintMethod = "printT2AddrModeImm12Operand"; + let PrintMethod = "printAddrModeImm12Operand"; + let EncoderMethod = "getAddrModeImm12OpValue"; let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); + let ParserMatchClass = MemMode5AsmOperand; } +// ADR instruction labels. +def t2adrlabel : Operand<i32> { + let EncoderMethod = "getT2AdrLabelOpValue"; +} + + // t2addrmode_imm8 := reg +/- imm8 def t2addrmode_imm8 : Operand<i32>, ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> { let PrintMethod = "printT2AddrModeImm8Operand"; + let EncoderMethod = "getT2AddrModeImm8OpValue"; let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); + let ParserMatchClass = MemMode5AsmOperand; } def t2am_imm8_offset : Operand<i32>, - ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset", []>{ + ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset", + [], [SDNPWantRoot]> { let PrintMethod = "printT2AddrModeImm8OffsetOperand"; + let EncoderMethod = "getT2AddrModeImm8OffsetOpValue"; + let ParserMatchClass = MemMode5AsmOperand; } // t2addrmode_imm8s4 := reg +/- (imm8 << 2) -def t2addrmode_imm8s4 : Operand<i32>, - ComplexPattern<i32, 2, "SelectT2AddrModeImm8s4", []> { +def t2addrmode_imm8s4 : Operand<i32> { let PrintMethod = "printT2AddrModeImm8s4Operand"; + let EncoderMethod = "getT2AddrModeImm8s4OpValue"; let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); + let ParserMatchClass = MemMode5AsmOperand; } def t2am_imm8s4_offset : Operand<i32> { @@ -159,7 +168,9 @@ def t2am_imm8s4_offset : Operand<i32> { def t2addrmode_so_reg : Operand<i32>, ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> { let PrintMethod = "printT2AddrModeSoRegOperand"; + let EncoderMethod = "getT2AddrModeSORegOpValue"; let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm); + let ParserMatchClass = MemMode5AsmOperand; } @@ -167,45 +178,294 @@ def t2addrmode_so_reg : Operand<i32>, // Multiclass helpers... // + +class T2OneRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + + +class T2sOneRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +class T2OneRegCmpImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; + bits<12> imm; + + let Inst{19-16} = Rn; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + + +class T2OneRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2sOneRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2OneRegCmpShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; + bits<12> ShiftedRm; + + let Inst{19-16} = Rn; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2TwoReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; +} + +class T2sTwoReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; +} + +class T2TwoRegCmp<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; + bits<4> Rm; + + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + + +class T2TwoRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +class T2sTwoRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +class T2TwoRegShiftImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + bits<5> imm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; + let Inst{14-12} = imm{4-2}; + let Inst{7-6} = imm{1-0}; +} + +class T2sTwoRegShiftImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + bits<5> imm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; + let Inst{14-12} = imm{4-2}; + let Inst{7-6} = imm{1-0}; +} + +class T2ThreeReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + +class T2sThreeReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + +class T2TwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2sTwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2FourReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + bits<4> Ra; + + let Inst{19-16} = Rn; + let Inst{15-12} = Ra; + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; +} + +class T2MulLong<bits<3> opc22_20, bits<4> opc7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rn; + bits<4> Rm; + + let Inst{31-23} = 0b111110111; + let Inst{22-20} = opc22_20; + let Inst{19-16} = Rn; + let Inst{15-12} = RdLo; + let Inst{11-8} = RdHi; + let Inst{7-4} = opc7_4; + let Inst{3-0} = Rm; +} + + /// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a /// unary operation that produces a value. These are predicable and can be /// changed to modify CPSR. -multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Cheap = 0, bit ReMat = 0> { +multiclass T2I_un_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Cheap = 0, bit ReMat = 0> { // shifted imm - def i : T2sI<(outs rGPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, - opc, "\t$dst, $src", - [(set rGPR:$dst, (opnode t2_so_imm:$src))]> { + def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii, + opc, "\t$Rd, $imm", + [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> { let isAsCheapAsAMove = Cheap; let isReMaterializable = ReMat; let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; - let Inst{20} = ?; // The S bit. let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; } // register - def r : T2sI<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVr, - opc, ".w\t$dst, $src", - [(set rGPR:$dst, (opnode rGPR:$src))]> { + def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir, + opc, ".w\t$Rd, $Rm", + [(set rGPR:$Rd, (opnode rGPR:$Rm))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; - let Inst{20} = ?; // The S bit. let Inst{19-16} = 0b1111; // Rn let Inst{14-12} = 0b000; // imm3 let Inst{7-6} = 0b00; // imm2 let Inst{5-4} = 0b00; // type } // shifted register - def s : T2sI<(outs rGPR:$dst), (ins t2_so_reg:$src), IIC_iMOVsi, - opc, ".w\t$dst, $src", - [(set rGPR:$dst, (opnode t2_so_reg:$src))]> { + def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis, + opc, ".w\t$Rd, $ShiftedRm", + [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; - let Inst{20} = ?; // The S bit. let Inst{19-16} = 0b1111; // Rn } } @@ -213,94 +473,97 @@ multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode, /// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a /// binary operation that produces a value. These are predicable and can be /// changed to modify CPSR. -multiclass T2I_bin_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0, string wide = ""> { +multiclass T2I_bin_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0, string wide = ""> { // shifted imm - def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, - opc, "\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_imm:$rhs))]> { + def ri : T2sTwoRegImm< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii, + opc, "\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; - let Inst{20} = ?; // The S bit. let Inst{15} = 0; } // register - def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iALUr, - opc, !strconcat(wide, "\t$dst, $lhs, $rhs"), - [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]> { + def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), iir, + opc, !strconcat(wide, "\t$Rd, $Rn, $Rm"), + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> { let isCommutable = Commutable; let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; - let Inst{20} = ?; // The S bit. let Inst{14-12} = 0b000; // imm3 let Inst{7-6} = 0b00; // imm2 let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, - opc, !strconcat(wide, "\t$dst, $lhs, $rhs"), - [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_reg:$rhs))]> { + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), iis, + opc, !strconcat(wide, "\t$Rd, $Rn, $ShiftedRm"), + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; - let Inst{20} = ?; // The S bit. } } /// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need // the ".w" prefix to indicate that they are wide. -multiclass T2I_bin_w_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0> : - T2I_bin_irs<opcod, opc, opnode, Commutable, ".w">; +multiclass T2I_bin_w_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> : + T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, Commutable, ".w">; /// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are /// reversed. The 'rr' form is only defined for the disassembler; for codegen /// it is equivalent to the T2I_bin_irs counterpart. multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> { // shifted imm - def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_imm:$lhs), IIC_iALUi, - opc, ".w\t$dst, $rhs, $lhs", - [(set rGPR:$dst, (opnode t2_so_imm:$lhs, rGPR:$rhs))]> { + def ri : T2sTwoRegImm< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi, + opc, ".w\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; - let Inst{20} = ?; // The S bit. let Inst{15} = 0; } // register - def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, rGPR:$lhs), IIC_iALUr, - opc, "\t$dst, $rhs, $lhs", + def rr : T2sThreeReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr, + opc, "\t$Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; - let Inst{20} = ?; // The S bit. let Inst{14-12} = 0b000; // imm3 let Inst{7-6} = 0b00; // imm2 let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi, - opc, "\t$dst, $rhs, $lhs", - [(set rGPR:$dst, (opnode t2_so_reg:$lhs, rGPR:$rhs))]> { + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), + IIC_iALUsir, opc, "\t$Rd, $Rn, $ShiftedRm", + [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; - let Inst{20} = ?; // The S bit. } } /// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the /// instruction modifies the CPSR register. -let Defs = [CPSR] in { -multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0> { +let isCodeGenOnly = 1, Defs = [CPSR] in { +multiclass T2I_bin_s_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2I<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, - !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> { + def ri : T2TwoRegImm< + (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), iii, + !strconcat(opc, "s"), ".w\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_imm:$imm))]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; @@ -308,9 +571,10 @@ multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{15} = 0; } // register - def rr : T2I<(outs rGPR:$dst), (ins GPR:$lhs, rGPR:$rhs), IIC_iALUr, - !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode GPR:$lhs, rGPR:$rhs))]> { + def rr : T2ThreeReg< + (outs rGPR:$Rd), (ins GPR:$Rn, rGPR:$Rm), iir, + !strconcat(opc, "s"), ".w\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode GPR:$Rn, rGPR:$Rm))]> { let isCommutable = Commutable; let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; @@ -321,9 +585,10 @@ multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2I<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, - !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> { + def rs : T2TwoRegShiftedReg< + (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), iis, + !strconcat(opc, "s"), ".w\t$Rd, $Rn, $ShiftedRm", + [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; @@ -340,51 +605,58 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode, // The register-immediate version is re-materializable. This is useful // in particular for taking the address of a local. let isReMaterializable = 1 in { - def ri : T2sI<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, - opc, ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> { + def ri : T2sTwoRegImm< + (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), IIC_iALUi, + opc, ".w\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_imm:$imm))]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24} = 1; let Inst{23-21} = op23_21; - let Inst{20} = 0; // The S bit. let Inst{15} = 0; } } // 12-bit imm - def ri12 : T2I<(outs rGPR:$dst), (ins GPR:$lhs, imm0_4095:$rhs), IIC_iALUi, - !strconcat(opc, "w"), "\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode GPR:$lhs, imm0_4095:$rhs))]> { + def ri12 : T2I< + (outs rGPR:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi, + !strconcat(opc, "w"), "\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; let Inst{31-27} = 0b11110; - let Inst{25} = 1; - let Inst{24} = 0; + let Inst{26} = imm{11}; + let Inst{25-24} = 0b10; let Inst{23-21} = op23_21; let Inst{20} = 0; // The S bit. + let Inst{19-16} = Rn; let Inst{15} = 0; + let Inst{14-12} = imm{10-8}; + let Inst{11-8} = Rd; + let Inst{7-0} = imm{7-0}; } // register - def rr : T2sI<(outs rGPR:$dst), (ins GPR:$lhs, rGPR:$rhs), IIC_iALUr, - opc, ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode GPR:$lhs, rGPR:$rhs))]> { + def rr : T2sThreeReg<(outs rGPR:$Rd), (ins GPR:$Rn, rGPR:$Rm), IIC_iALUr, + opc, ".w\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode GPR:$Rn, rGPR:$Rm))]> { let isCommutable = Commutable; let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24} = 1; let Inst{23-21} = op23_21; - let Inst{20} = 0; // The S bit. let Inst{14-12} = 0b000; // imm3 let Inst{7-6} = 0b00; // imm2 let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sI<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, - opc, ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> { + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), + IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm", + [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24} = 1; let Inst{23-21} = op23_21; - let Inst{20} = 0; // The S bit. } } @@ -395,50 +667,49 @@ let Uses = [CPSR] in { multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, - opc, "\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_imm:$rhs))]>, + def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), + IIC_iALUi, opc, "\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>, Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; - let Inst{20} = 0; // The S bit. let Inst{15} = 0; } // register - def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iALUr, - opc, ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]>, + def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr, + opc, ".w\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>, Requires<[IsThumb2]> { let isCommutable = Commutable; let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; - let Inst{20} = 0; // The S bit. let Inst{14-12} = 0b000; // imm3 let Inst{7-6} = 0b00; // imm2 let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, - opc, ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_reg:$rhs))]>, + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), + IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>, Requires<[IsThumb2]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; - let Inst{20} = 0; // The S bit. } } // Carry setting variants -let Defs = [CPSR] in { +let isCodeGenOnly = 1, Defs = [CPSR] in { multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, - opc, "\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_imm:$rhs))]>, + def ri : T2sTwoRegImm< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi, + opc, "\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>, Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; @@ -447,9 +718,9 @@ multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{15} = 0; } // register - def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iALUr, - opc, ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]>, + def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr, + opc, ".w\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>, Requires<[IsThumb2]> { let isCommutable = Commutable; let Inst{31-27} = 0b11101; @@ -461,9 +732,10 @@ multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, - opc, ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_reg:$rhs))]>, + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), + IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>, Requires<[IsThumb2]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; @@ -476,12 +748,13 @@ multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, /// T2I_rbin_s_is - Same as T2I_rbin_irs except sets 's' bit and the register /// version is not needed since this is only for codegen. -let Defs = [CPSR] in { +let isCodeGenOnly = 1, Defs = [CPSR] in { multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> { // shifted imm - def ri : T2I<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_imm:$lhs), IIC_iALUi, - !strconcat(opc, "s"), ".w\t$dst, $rhs, $lhs", - [(set rGPR:$dst, (opnode t2_so_imm:$lhs, rGPR:$rhs))]> { + def ri : T2TwoRegImm< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi, + !strconcat(opc, "s"), ".w\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; @@ -489,9 +762,10 @@ multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> { let Inst{15} = 0; } // shifted register - def rs : T2I<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi, - !strconcat(opc, "s"), "\t$dst, $rhs, $lhs", - [(set rGPR:$dst, (opnode t2_so_reg:$lhs, rGPR:$rhs))]> { + def rs : T2TwoRegShiftedReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), + IIC_iALUsi, !strconcat(opc, "s"), "\t$Rd, $Rn, $ShiftedRm", + [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; @@ -504,18 +778,20 @@ multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> { // rotate operation that produces a value. multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> { // 5-bit imm - def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, i32imm:$rhs), IIC_iMOVsi, - opc, ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode rGPR:$lhs, imm1_31:$rhs))]> { + def ri : T2sTwoRegShiftImm< + (outs rGPR:$Rd), (ins rGPR:$Rm, i32imm:$imm), IIC_iMOVsi, + opc, ".w\t$Rd, $Rm, $imm", + [(set rGPR:$Rd, (opnode rGPR:$Rm, imm1_31:$imm))]> { let Inst{31-27} = 0b11101; let Inst{26-21} = 0b010010; let Inst{19-16} = 0b1111; // Rn let Inst{5-4} = opcod; } // register - def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iMOVsr, - opc, ".w\t$dst, $lhs, $rhs", - [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]> { + def rr : T2sThreeReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMOVsr, + opc, ".w\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-21} = opcod; @@ -528,11 +804,14 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> { /// patterns. Similar to T2I_bin_irs except the instruction does not produce /// a explicit result, only implicitly set CPSR. let isCompare = 1, Defs = [CPSR] in { -multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> { +multiclass T2I_cmp_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode> { // shifted imm - def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iCMPi, - opc, ".w\t$lhs, $rhs", - [(opnode GPR:$lhs, t2_so_imm:$rhs)]> { + def ri : T2OneRegCmpImm< + (outs), (ins GPR:$Rn, t2_so_imm:$imm), iii, + opc, ".w\t$Rn, $imm", + [(opnode GPR:$Rn, t2_so_imm:$imm)]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; @@ -541,7 +820,8 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> { let Inst{11-8} = 0b1111; // Rd } // register - def rr : T2I<(outs), (ins GPR:$lhs, rGPR:$rhs), IIC_iCMPr, + def rr : T2TwoRegCmp< + (outs), (ins GPR:$lhs, rGPR:$rhs), iir, opc, ".w\t$lhs, $rhs", [(opnode GPR:$lhs, rGPR:$rhs)]> { let Inst{31-27} = 0b11101; @@ -554,9 +834,10 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> { let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2I<(outs), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iCMPsi, - opc, ".w\t$lhs, $rhs", - [(opnode GPR:$lhs, t2_so_reg:$rhs)]> { + def rs : T2OneRegCmpShiftedReg< + (outs), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), iis, + opc, ".w\t$Rn, $ShiftedRm", + [(opnode GPR:$Rn, t2_so_reg:$ShiftedRm)]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; @@ -567,20 +848,29 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> { } /// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns. -multiclass T2I_ld<bit signed, bits<2> opcod, string opc, PatFrag opnode> { - def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr), IIC_iLoadi, - opc, ".w\t$dst, $addr", - [(set GPR:$dst, (opnode t2addrmode_imm12:$addr))]> { +multiclass T2I_ld<bit signed, bits<2> opcod, string opc, + InstrItinClass iii, InstrItinClass iis, PatFrag opnode> { + def i12 : T2Ii12<(outs GPR:$Rt), (ins t2addrmode_imm12:$addr), iii, + opc, ".w\t$Rt, $addr", + [(set GPR:$Rt, (opnode t2addrmode_imm12:$addr))]> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; let Inst{24} = signed; let Inst{23} = 1; let Inst{22-21} = opcod; let Inst{20} = 1; // load + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<17> addr; + let Inst{19-16} = addr{16-13}; // Rn + let Inst{23} = addr{12}; // U + let Inst{11-0} = addr{11-0}; // imm } - def i8 : T2Ii8 <(outs GPR:$dst), (ins t2addrmode_imm8:$addr), IIC_iLoadi, - opc, "\t$dst, $addr", - [(set GPR:$dst, (opnode t2addrmode_imm8:$addr))]> { + def i8 : T2Ii8 <(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), iii, + opc, "\t$Rt, $addr", + [(set GPR:$Rt, (opnode t2addrmode_imm8:$addr))]> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; let Inst{24} = signed; @@ -591,10 +881,18 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, PatFrag opnode> { // Offset: index==TRUE, wback==FALSE let Inst{10} = 1; // The P bit. let Inst{8} = 0; // The W bit. + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<13> addr; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{9} = addr{8}; // U + let Inst{7-0} = addr{7-0}; // imm } - def s : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr), IIC_iLoadr, - opc, ".w\t$dst, $addr", - [(set GPR:$dst, (opnode t2addrmode_so_reg:$addr))]> { + def s : T2Iso <(outs GPR:$Rt), (ins t2addrmode_so_reg:$addr), iis, + opc, ".w\t$Rt, $addr", + [(set GPR:$Rt, (opnode t2addrmode_so_reg:$addr))]> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; let Inst{24} = signed; @@ -602,10 +900,20 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, PatFrag opnode> { let Inst{22-21} = opcod; let Inst{20} = 1; // load let Inst{11-6} = 0b000000; + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<10> addr; + let Inst{19-16} = addr{9-6}; // Rn + let Inst{3-0} = addr{5-2}; // Rm + let Inst{5-4} = addr{1-0}; // imm } - def pci : T2Ipc <(outs GPR:$dst), (ins i32imm:$addr), IIC_iLoadi, - opc, ".w\t$dst, $addr", - [(set GPR:$dst, (opnode (ARMWrapper tconstpool:$addr)))]> { + + // FIXME: Is the pci variant actually needed? + def pci : T2Ipc <(outs GPR:$Rt), (ins i32imm:$addr), iii, + opc, ".w\t$Rt, $addr", + [(set GPR:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> { let isReMaterializable = 1; let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; @@ -614,22 +922,35 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, PatFrag opnode> { let Inst{22-21} = opcod; let Inst{20} = 1; // load let Inst{19-16} = 0b1111; // Rn + bits<4> Rt; + bits<12> addr; + let Inst{15-12} = Rt{3-0}; + let Inst{11-0} = addr{11-0}; } } /// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns. -multiclass T2I_st<bits<2> opcod, string opc, PatFrag opnode> { - def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr), IIC_iStorei, - opc, ".w\t$src, $addr", - [(opnode GPR:$src, t2addrmode_imm12:$addr)]> { +multiclass T2I_st<bits<2> opcod, string opc, + InstrItinClass iii, InstrItinClass iis, PatFrag opnode> { + def i12 : T2Ii12<(outs), (ins GPR:$Rt, t2addrmode_imm12:$addr), iii, + opc, ".w\t$Rt, $addr", + [(opnode GPR:$Rt, t2addrmode_imm12:$addr)]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0001; let Inst{22-21} = opcod; let Inst{20} = 0; // !load + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<17> addr; + let Inst{19-16} = addr{16-13}; // Rn + let Inst{23} = addr{12}; // U + let Inst{11-0} = addr{11-0}; // imm } - def i8 : T2Ii8 <(outs), (ins GPR:$src, t2addrmode_imm8:$addr), IIC_iStorei, - opc, "\t$src, $addr", - [(opnode GPR:$src, t2addrmode_imm8:$addr)]> { + def i8 : T2Ii8 <(outs), (ins GPR:$Rt, t2addrmode_imm8:$addr), iii, + opc, "\t$Rt, $addr", + [(opnode GPR:$Rt, t2addrmode_imm8:$addr)]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0000; let Inst{22-21} = opcod; @@ -638,24 +959,40 @@ multiclass T2I_st<bits<2> opcod, string opc, PatFrag opnode> { // Offset: index==TRUE, wback==FALSE let Inst{10} = 1; // The P bit. let Inst{8} = 0; // The W bit. + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<13> addr; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{9} = addr{8}; // U + let Inst{7-0} = addr{7-0}; // imm } - def s : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr), IIC_iStorer, - opc, ".w\t$src, $addr", - [(opnode GPR:$src, t2addrmode_so_reg:$addr)]> { + def s : T2Iso <(outs), (ins GPR:$Rt, t2addrmode_so_reg:$addr), iis, + opc, ".w\t$Rt, $addr", + [(opnode GPR:$Rt, t2addrmode_so_reg:$addr)]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0000; let Inst{22-21} = opcod; let Inst{20} = 0; // !load let Inst{11-6} = 0b000000; + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<10> addr; + let Inst{19-16} = addr{9-6}; // Rn + let Inst{3-0} = addr{5-2}; // Rm + let Inst{5-4} = addr{1-0}; // imm } } -/// T2I_unary_rrot - A unary operation with two forms: one whose operand is a +/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. -multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> { - def r : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, - opc, ".w\t$dst, $src", - [(set rGPR:$dst, (opnode rGPR:$src))]> { +multiclass T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode> { + def r : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr, + opc, ".w\t$Rd, $Rm", + [(set rGPR:$Rd, (opnode rGPR:$Rm))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; @@ -664,25 +1001,27 @@ multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> { let Inst{7} = 1; let Inst{5-4} = 0b00; // rotate } - def r_rot : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$rot), IIC_iUNAsi, - opc, ".w\t$dst, $src, ror $rot", - [(set rGPR:$dst, (opnode (rotr rGPR:$src, rot_imm:$rot)))]> { + def r_rot : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr, + opc, ".w\t$Rd, $Rm, ror $rot", + [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; let Inst{19-16} = 0b1111; // Rn let Inst{15-12} = 0b1111; let Inst{7} = 1; - let Inst{5-4} = {?,?}; // rotate + + bits<2> rot; + let Inst{5-4} = rot{1-0}; // rotate } } // UXTB16 - Requres T2ExtractPack, does not need the .w qualifier. -multiclass T2I_unary_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> { - def r : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, - opc, "\t$dst, $src", - [(set rGPR:$dst, (opnode rGPR:$src))]>, - Requires<[HasT2ExtractPack]> { +multiclass T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> { + def r : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr, + opc, "\t$Rd, $Rm", + [(set rGPR:$Rd, (opnode rGPR:$Rm))]>, + Requires<[HasT2ExtractPack, IsThumb2]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; @@ -691,25 +1030,27 @@ multiclass T2I_unary_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> { let Inst{7} = 1; let Inst{5-4} = 0b00; // rotate } - def r_rot : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$rot), IIC_iUNAsi, - opc, "\t$dst, $src, ror $rot", - [(set rGPR:$dst, (opnode (rotr rGPR:$src, rot_imm:$rot)))]>, - Requires<[HasT2ExtractPack]> { + def r_rot : T2TwoReg<(outs rGPR:$dst), (ins rGPR:$Rm, rot_imm:$rot), + IIC_iEXTr, opc, "\t$dst, $Rm, ror $rot", + [(set rGPR:$dst, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>, + Requires<[HasT2ExtractPack, IsThumb2]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; let Inst{19-16} = 0b1111; // Rn let Inst{15-12} = 0b1111; let Inst{7} = 1; - let Inst{5-4} = {?,?}; // rotate + + bits<2> rot; + let Inst{5-4} = rot{1-0}; // rotate } } // SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern // supported yet. -multiclass T2I_unary_rrot_sxtb16<bits<3> opcod, string opc> { - def r : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, - opc, "\t$dst, $src", []> { +multiclass T2I_ext_rrot_sxtb16<bits<3> opcod, string opc> { + def r : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr, + opc, "\t$Rd, $Rm", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; @@ -718,25 +1059,27 @@ multiclass T2I_unary_rrot_sxtb16<bits<3> opcod, string opc> { let Inst{7} = 1; let Inst{5-4} = 0b00; // rotate } - def r_rot : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$rot), IIC_iUNAsi, - opc, "\t$dst, $src, ror $rot", []> { + def r_rot : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, i32imm:$rot), IIC_iEXTr, + opc, "\t$Rd, $Rm, ror $rot", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; let Inst{19-16} = 0b1111; // Rn let Inst{15-12} = 0b1111; let Inst{7} = 1; - let Inst{5-4} = {?,?}; // rotate + + bits<2> rot; + let Inst{5-4} = rot{1-0}; // rotate } } -/// T2I_bin_rrot - A binary operation with two forms: one whose operand is a +/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. -multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> { - def rr : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS), IIC_iALUr, - opc, "\t$dst, $LHS, $RHS", - [(set rGPR:$dst, (opnode rGPR:$LHS, rGPR:$RHS))]>, - Requires<[HasT2ExtractPack]> { +multiclass T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode> { + def rr : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iEXTAr, + opc, "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>, + Requires<[HasT2ExtractPack, IsThumb2]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; @@ -744,25 +1087,28 @@ multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> { let Inst{7} = 1; let Inst{5-4} = 0b00; // rotate } - def rr_rot : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS, i32imm:$rot), - IIC_iALUsr, opc, "\t$dst, $LHS, $RHS, ror $rot", - [(set rGPR:$dst, (opnode rGPR:$LHS, - (rotr rGPR:$RHS, rot_imm:$rot)))]>, - Requires<[HasT2ExtractPack]> { + def rr_rot : T2ThreeReg<(outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot), + IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm, ror $rot", + [(set rGPR:$Rd, (opnode rGPR:$Rn, + (rotr rGPR:$Rm, rot_imm:$rot)))]>, + Requires<[HasT2ExtractPack, IsThumb2]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; let Inst{15-12} = 0b1111; let Inst{7} = 1; - let Inst{5-4} = {?,?}; // rotate + + bits<2> rot; + let Inst{5-4} = rot{1-0}; // rotate } } // DO variant - disassembly only, no pattern -multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> { - def rr : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS), IIC_iALUr, - opc, "\t$dst, $LHS, $RHS", []> { +multiclass T2I_exta_rrot_DO<bits<3> opcod, string opc> { + def rr : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iEXTAr, + opc, "\t$Rd, $Rn, $Rm", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; @@ -770,14 +1116,16 @@ multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> { let Inst{7} = 1; let Inst{5-4} = 0b00; // rotate } - def rr_rot : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS, i32imm:$rot), - IIC_iALUsr, opc, "\t$dst, $LHS, $RHS, ror $rot", []> { + def rr_rot : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, i32imm:$rot), + IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm, ror $rot", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; let Inst{15-12} = 0b1111; let Inst{7} = 1; - let Inst{5-4} = {?,?}; // rotate + + bits<2> rot; + let Inst{5-4} = rot{1-0}; // rotate } } @@ -789,24 +1137,23 @@ multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> { // Miscellaneous Instructions. // +class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : T2XI<oops, iops, itin, asm, pattern> { + bits<4> Rd; + bits<12> label; + + let Inst{11-8} = Rd; + let Inst{26} = label{11}; + let Inst{14-12} = label{10-8}; + let Inst{7-0} = label{7-0}; +} + // LEApcrel - Load a pc-relative address into a register without offending the // assembler. -let neverHasSideEffects = 1 in { -let isReMaterializable = 1 in -def t2LEApcrel : T2XI<(outs rGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, - "adr${p}.w\t$dst, #$label", []> { - let Inst{31-27} = 0b11110; - let Inst{25-24} = 0b10; - // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE) - let Inst{22} = 0; - let Inst{20} = 0; - let Inst{19-16} = 0b1111; // Rn - let Inst{15} = 0; -} -} // neverHasSideEffects -def t2LEApcrelJT : T2XI<(outs rGPR:$dst), - (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi, - "adr${p}.w\t$dst, #${label}_${id}", []> { +def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd), + (ins t2adrlabel:$addr, pred:$p), + IIC_iALUi, "adr{$p}.w\t$Rd, #$addr", []> { let Inst{31-27} = 0b11110; let Inst{25-24} = 0b10; // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE) @@ -814,76 +1161,88 @@ def t2LEApcrelJT : T2XI<(outs rGPR:$dst), let Inst{20} = 0; let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; -} + bits<4> Rd; + bits<13> addr; + let Inst{11-8} = Rd; + let Inst{23} = addr{12}; + let Inst{21} = addr{12}; + let Inst{26} = addr{11}; + let Inst{14-12} = addr{10-8}; + let Inst{7-0} = addr{7-0}; +} + +let neverHasSideEffects = 1, isReMaterializable = 1 in +def t2LEApcrel : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p), + Size4Bytes, IIC_iALUi, []>; +def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd), + (ins i32imm:$label, nohash_imm:$id, pred:$p), + Size4Bytes, IIC_iALUi, + []>; + + +// FIXME: None of these add/sub SP special instructions should be necessary +// at all for thumb2 since they use the same encodings as the generic +// add/sub instructions. In thumb1 we need them since they have dedicated +// encodings. At the least, they should be pseudo instructions. // ADD r, sp, {so_imm|i12} -def t2ADDrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), - IIC_iALUi, "add", ".w\t$dst, $sp, $imm", []> { +let isCodeGenOnly = 1 in { +def t2ADDrSPi : T2sTwoRegImm<(outs GPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), + IIC_iALUi, "add", ".w\t$Rd, $Rn, $imm", []> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = 0b1000; - let Inst{20} = ?; // The S bit. - let Inst{19-16} = 0b1101; // Rn = sp let Inst{15} = 0; } -def t2ADDrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), - IIC_iALUi, "addw", "\t$dst, $sp, $imm", []> { +def t2ADDrSPi12 : T2TwoRegImm<(outs GPR:$Rd), (ins GPR:$Rn, imm0_4095:$imm), + IIC_iALUi, "addw", "\t$Rd, $Rn, $imm", []> { let Inst{31-27} = 0b11110; - let Inst{25} = 1; - let Inst{24-21} = 0b0000; - let Inst{20} = 0; // The S bit. - let Inst{19-16} = 0b1101; // Rn = sp + let Inst{25-20} = 0b100000; let Inst{15} = 0; } // ADD r, sp, so_reg -def t2ADDrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), - IIC_iALUsi, "add", ".w\t$dst, $sp, $rhs", []> { +def t2ADDrSPs : T2sTwoRegShiftedReg< + (outs GPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), + IIC_iALUsi, "add", ".w\t$Rd, $Rn, $ShiftedRm", []> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b1000; - let Inst{20} = ?; // The S bit. - let Inst{19-16} = 0b1101; // Rn = sp let Inst{15} = 0; } // SUB r, sp, {so_imm|i12} -def t2SUBrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), - IIC_iALUi, "sub", ".w\t$dst, $sp, $imm", []> { +def t2SUBrSPi : T2sTwoRegImm<(outs GPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), + IIC_iALUi, "sub", ".w\t$Rd, $Rn, $imm", []> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = 0b1101; - let Inst{20} = ?; // The S bit. - let Inst{19-16} = 0b1101; // Rn = sp let Inst{15} = 0; } -def t2SUBrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), - IIC_iALUi, "subw", "\t$dst, $sp, $imm", []> { +def t2SUBrSPi12 : T2TwoRegImm<(outs GPR:$Rd), (ins GPR:$Rn, imm0_4095:$imm), + IIC_iALUi, "subw", "\t$Rd, $Rn, $imm", []> { let Inst{31-27} = 0b11110; - let Inst{25} = 1; - let Inst{24-21} = 0b0101; - let Inst{20} = 0; // The S bit. - let Inst{19-16} = 0b1101; // Rn = sp + let Inst{25-20} = 0b101010; let Inst{15} = 0; } // SUB r, sp, so_reg -def t2SUBrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), +def t2SUBrSPs : T2sTwoRegImm<(outs GPR:$Rd), (ins GPR:$Rn, t2_so_reg:$imm), IIC_iALUsi, - "sub", "\t$dst, $sp, $rhs", []> { + "sub", "\t$Rd, $Rn, $imm", []> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b1101; - let Inst{20} = ?; // The S bit. let Inst{19-16} = 0b1101; // Rn = sp let Inst{15} = 0; } +} // end isCodeGenOnly = 1 // Signed and unsigned division on v7-M -def t2SDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi, - "sdiv", "\t$dst, $a, $b", - [(set rGPR:$dst, (sdiv rGPR:$a, rGPR:$b))]>, - Requires<[HasDivide]> { +def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi, + "sdiv", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>, + Requires<[HasDivide, IsThumb2]> { let Inst{31-27} = 0b11111; let Inst{26-21} = 0b011100; let Inst{20} = 0b1; @@ -891,10 +1250,10 @@ def t2SDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi, let Inst{7-4} = 0b1111; } -def t2UDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi, - "udiv", "\t$dst, $a, $b", - [(set rGPR:$dst, (udiv rGPR:$a, rGPR:$b))]>, - Requires<[HasDivide]> { +def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi, + "udiv", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>, + Requires<[HasDivide, IsThumb2]> { let Inst{31-27} = 0b11111; let Inst{26-21} = 0b011101; let Inst{20} = 0b1; @@ -908,26 +1267,26 @@ def t2UDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi, // Load let canFoldAsLoad = 1, isReMaterializable = 1 in -defm t2LDR : T2I_ld<0, 0b10, "ldr", UnOpFrag<(load node:$Src)>>; +defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, + UnOpFrag<(load node:$Src)>>; // Loads with zero extension -defm t2LDRH : T2I_ld<0, 0b01, "ldrh", UnOpFrag<(zextloadi16 node:$Src)>>; -defm t2LDRB : T2I_ld<0, 0b00, "ldrb", UnOpFrag<(zextloadi8 node:$Src)>>; +defm t2LDRH : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + UnOpFrag<(zextloadi16 node:$Src)>>; +defm t2LDRB : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + UnOpFrag<(zextloadi8 node:$Src)>>; // Loads with sign extension -defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", UnOpFrag<(sextloadi16 node:$Src)>>; -defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", UnOpFrag<(sextloadi8 node:$Src)>>; +defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + UnOpFrag<(sextloadi16 node:$Src)>>; +defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + UnOpFrag<(sextloadi8 node:$Src)>>; let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { // Load doubleword -def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$dst1, rGPR:$dst2), +def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_imm8s4:$addr), - IIC_iLoadi, "ldrd", "\t$dst1, $addr", []>; -def t2LDRDpci : T2Ii8s4<1, 0, 1, (outs rGPR:$dst1, rGPR:$dst2), - (ins i32imm:$addr), IIC_iLoadi, - "ldrd", "\t$dst1, $addr", []> { - let Inst{19-16} = 0b1111; // Rn -} + IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", []>; } // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 // zextload i1 -> zextload i8 @@ -976,70 +1335,71 @@ def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)), // not via pattern. // Indexed loads + let mayLoad = 1, neverHasSideEffects = 1 in { -def t2LDR_PRE : T2Iidxldst<0, 0b10, 1, 1, (outs GPR:$dst, GPR:$base_wb), +def t2LDR_PRE : T2Iidxldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn), (ins t2addrmode_imm8:$addr), - AddrModeT2_i8, IndexModePre, IIC_iLoadiu, - "ldr", "\t$dst, $addr!", "$addr.base = $base_wb", + AddrModeT2_i8, IndexModePre, IIC_iLoad_iu, + "ldr", "\t$Rt, $addr!", "$addr.base = $Rn", []>; -def t2LDR_POST : T2Iidxldst<0, 0b10, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, IIC_iLoadiu, - "ldr", "\t$dst, [$base], $offset", "$base = $base_wb", +def t2LDR_POST : T2Iidxldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn), + (ins GPR:$base, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePost, IIC_iLoad_iu, + "ldr", "\t$Rt, [$Rn], $addr", "$base = $Rn", []>; -def t2LDRB_PRE : T2Iidxldst<0, 0b00, 1, 1, (outs GPR:$dst, GPR:$base_wb), +def t2LDRB_PRE : T2Iidxldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn), (ins t2addrmode_imm8:$addr), - AddrModeT2_i8, IndexModePre, IIC_iLoadiu, - "ldrb", "\t$dst, $addr!", "$addr.base = $base_wb", + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn", []>; -def t2LDRB_POST : T2Iidxldst<0, 0b00, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, IIC_iLoadiu, - "ldrb", "\t$dst, [$base], $offset", "$base = $base_wb", +def t2LDRB_POST : T2Iidxldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn), + (ins GPR:$base, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrb", "\t$Rt, [$Rn], $addr", "$base = $Rn", []>; -def t2LDRH_PRE : T2Iidxldst<0, 0b01, 1, 1, (outs GPR:$dst, GPR:$base_wb), +def t2LDRH_PRE : T2Iidxldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn), (ins t2addrmode_imm8:$addr), - AddrModeT2_i8, IndexModePre, IIC_iLoadiu, - "ldrh", "\t$dst, $addr!", "$addr.base = $base_wb", + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn", []>; -def t2LDRH_POST : T2Iidxldst<0, 0b01, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, IIC_iLoadiu, - "ldrh", "\t$dst, [$base], $offset", "$base = $base_wb", +def t2LDRH_POST : T2Iidxldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn), + (ins GPR:$base, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrh", "\t$Rt, [$Rn], $addr", "$base = $Rn", []>; -def t2LDRSB_PRE : T2Iidxldst<1, 0b00, 1, 1, (outs GPR:$dst, GPR:$base_wb), +def t2LDRSB_PRE : T2Iidxldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn), (ins t2addrmode_imm8:$addr), - AddrModeT2_i8, IndexModePre, IIC_iLoadiu, - "ldrsb", "\t$dst, $addr!", "$addr.base = $base_wb", + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn", []>; -def t2LDRSB_POST : T2Iidxldst<1, 0b00, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, IIC_iLoadiu, - "ldrsb", "\t$dst, [$base], $offset", "$base = $base_wb", +def t2LDRSB_POST : T2Iidxldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn), + (ins GPR:$base, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrsb", "\t$Rt, [$Rn], $addr", "$base = $Rn", []>; -def t2LDRSH_PRE : T2Iidxldst<1, 0b01, 1, 1, (outs GPR:$dst, GPR:$base_wb), +def t2LDRSH_PRE : T2Iidxldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn), (ins t2addrmode_imm8:$addr), - AddrModeT2_i8, IndexModePre, IIC_iLoadiu, - "ldrsh", "\t$dst, $addr!", "$addr.base = $base_wb", + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn", []>; -def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, IIC_iLoadiu, - "ldrsh", "\t$dst, [$base], $offset", "$base = $base_wb", +def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$Rn), + (ins GPR:$base, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrsh", "\t$dst, [$Rn], $addr", "$base = $Rn", []>; -} // mayLoad = 1, neverHasSideEffects = 1 +} // mayLoad = 1, neverHasSideEffects = 1 // LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110) and are // for disassembly only. // Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4 -class T2IldT<bit signed, bits<2> type, string opc> - : T2Ii8<(outs GPR:$dst), (ins t2addrmode_imm8:$addr), IIC_iLoadi, opc, - "\t$dst, $addr", []> { +class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii> + : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, + "\t$Rt, $addr", []> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; let Inst{24} = signed; @@ -1048,74 +1408,83 @@ class T2IldT<bit signed, bits<2> type, string opc> let Inst{20} = 1; // load let Inst{11} = 1; let Inst{10-8} = 0b110; // PUW. + + bits<4> Rt; + bits<13> addr; + let Inst{15-12} = Rt; + let Inst{19-16} = addr{12-9}; + let Inst{7-0} = addr{7-0}; } -def t2LDRT : T2IldT<0, 0b10, "ldrt">; -def t2LDRBT : T2IldT<0, 0b00, "ldrbt">; -def t2LDRHT : T2IldT<0, 0b01, "ldrht">; -def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt">; -def t2LDRSHT : T2IldT<1, 0b01, "ldrsht">; +def t2LDRT : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>; +def t2LDRBT : T2IldT<0, 0b00, "ldrbt", IIC_iLoad_bh_i>; +def t2LDRHT : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>; +def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>; +def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>; // Store -defm t2STR :T2I_st<0b10,"str", BinOpFrag<(store node:$LHS, node:$RHS)>>; -defm t2STRB:T2I_st<0b00,"strb",BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; -defm t2STRH:T2I_st<0b01,"strh",BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; +defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, + BinOpFrag<(store node:$LHS, node:$RHS)>>; +defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si, + BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; +defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si, + BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; // Store doubleword let mayLoad = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), - (ins GPR:$src1, GPR:$src2, t2addrmode_imm8s4:$addr), - IIC_iStorer, "strd", "\t$src1, $addr", []>; + (ins GPR:$Rt, GPR:$Rt2, t2addrmode_imm8s4:$addr), + IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", []>; // Indexed stores def t2STR_PRE : T2Iidxldst<0, 0b10, 0, 1, (outs GPR:$base_wb), - (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePre, IIC_iStoreiu, - "str", "\t$src, [$base, $offset]!", "$base = $base_wb", + (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePre, IIC_iStore_iu, + "str", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", [(set GPR:$base_wb, - (pre_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + (pre_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STR_POST : T2Iidxldst<0, 0b10, 0, 0, (outs GPR:$base_wb), - (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, IIC_iStoreiu, - "str", "\t$src, [$base], $offset", "$base = $base_wb", + (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePost, IIC_iStore_iu, + "str", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb", [(set GPR:$base_wb, - (post_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + (post_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRH_PRE : T2Iidxldst<0, 0b01, 0, 1, (outs GPR:$base_wb), - (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePre, IIC_iStoreiu, - "strh", "\t$src, [$base, $offset]!", "$base = $base_wb", + (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePre, IIC_iStore_iu, + "strh", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", [(set GPR:$base_wb, - (pre_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + (pre_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRH_POST : T2Iidxldst<0, 0b01, 0, 0, (outs GPR:$base_wb), - (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, IIC_iStoreiu, - "strh", "\t$src, [$base], $offset", "$base = $base_wb", + (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, + "strh", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb", [(set GPR:$base_wb, - (post_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + (post_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRB_PRE : T2Iidxldst<0, 0b00, 0, 1, (outs GPR:$base_wb), - (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePre, IIC_iStoreiu, - "strb", "\t$src, [$base, $offset]!", "$base = $base_wb", + (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu, + "strb", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", [(set GPR:$base_wb, - (pre_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + (pre_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb), - (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, IIC_iStoreiu, - "strb", "\t$src, [$base], $offset", "$base = $base_wb", + (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), + AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, + "strb", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb", [(set GPR:$base_wb, - (post_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; + (post_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; // STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly // only. // Ref: A8.6.193 STR (immediate, Thumb) Encoding T4 -class T2IstT<bits<2> type, string opc> - : T2Ii8<(outs GPR:$src), (ins t2addrmode_imm8:$addr), IIC_iStorei, opc, - "\t$src, $addr", []> { +class T2IstT<bits<2> type, string opc, InstrItinClass ii> + : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, + "\t$Rt, $addr", []> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; let Inst{24} = 0; // not signed @@ -1124,51 +1493,62 @@ class T2IstT<bits<2> type, string opc> let Inst{20} = 0; // store let Inst{11} = 1; let Inst{10-8} = 0b110; // PUW + + bits<4> Rt; + bits<13> addr; + let Inst{15-12} = Rt; + let Inst{19-16} = addr{12-9}; + let Inst{7-0} = addr{7-0}; } -def t2STRT : T2IstT<0b10, "strt">; -def t2STRBT : T2IstT<0b00, "strbt">; -def t2STRHT : T2IstT<0b01, "strht">; +def t2STRT : T2IstT<0b10, "strt", IIC_iStore_i>; +def t2STRBT : T2IstT<0b00, "strbt", IIC_iStore_bh_i>; +def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>; // ldrd / strd pre / post variants // For disassembly only. -def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs GPR:$dst1, GPR:$dst2), - (ins GPR:$base, t2am_imm8s4_offset:$imm), NoItinerary, - "ldrd", "\t$dst1, $dst2, [$base, $imm]!", []>; +def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs GPR:$Rt, GPR:$Rt2), + (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, + "ldrd", "\t$Rt, $Rt2, [$base, $imm]!", []>; -def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs GPR:$dst1, GPR:$dst2), - (ins GPR:$base, t2am_imm8s4_offset:$imm), NoItinerary, - "ldrd", "\t$dst1, $dst2, [$base], $imm", []>; +def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs GPR:$Rt, GPR:$Rt2), + (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, + "ldrd", "\t$Rt, $Rt2, [$base], $imm", []>; def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs), - (ins GPR:$src1, GPR:$src2, GPR:$base, t2am_imm8s4_offset:$imm), - NoItinerary, "strd", "\t$src1, $src2, [$base, $imm]!", []>; + (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), + IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base, $imm]!", []>; def t2STRD_POST : T2Ii8s4<0, 1, 0, (outs), - (ins GPR:$src1, GPR:$src2, GPR:$base, t2am_imm8s4_offset:$imm), - NoItinerary, "strd", "\t$src1, $src2, [$base], $imm", []>; + (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), + IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base], $imm", []>; // T2Ipl (Preload Data/Instruction) signals the memory system of possible future // data/instruction access. These are for disassembly only. -// -// A8.6.117, A8.6.118. Different instructions are generated for #0 and #-0. -// The neg_zero operand translates -0 to -1, -1 to -2, ..., etc. -multiclass T2Ipl<bit instr, bit write, string opc> { +// instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0), +// (prefetch 1) -> (preload 2), (prefetch 2) -> (preload 1). +multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> { - def i12 : T2I<(outs), (ins GPR:$base, i32imm:$imm), IIC_iLoadi, opc, - "\t[$base, $imm]", []> { + def i12 : T2Ii12<(outs), (ins t2addrmode_imm12:$addr), IIC_Preload, opc, + "\t$addr", + [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]> { let Inst{31-25} = 0b1111100; let Inst{24} = instr; - let Inst{23} = 1; // U = 1 let Inst{22} = 0; let Inst{21} = write; let Inst{20} = 1; let Inst{15-12} = 0b1111; + + bits<17> addr; + let Inst{19-16} = addr{16-13}; // Rn + let Inst{23} = addr{12}; // U + let Inst{11-0} = addr{11-0}; // imm12 } - def i8 : T2I<(outs), (ins GPR:$base, neg_zero:$imm), IIC_iLoadi, opc, - "\t[$base, $imm]", []> { + def i8 : T2Ii8<(outs), (ins t2addrmode_imm8:$addr), IIC_Preload, opc, + "\t$addr", + [(ARMPreload t2addrmode_imm8:$addr, (i32 write), (i32 instr))]> { let Inst{31-25} = 0b1111100; let Inst{24} = instr; let Inst{23} = 0; // U = 0 @@ -1177,22 +1557,15 @@ multiclass T2Ipl<bit instr, bit write, string opc> { let Inst{20} = 1; let Inst{15-12} = 0b1111; let Inst{11-8} = 0b1100; - } - def pci : T2I<(outs), (ins GPR:$base, neg_zero:$imm), IIC_iLoadi, opc, - "\t[pc, $imm]", []> { - let Inst{31-25} = 0b1111100; - let Inst{24} = instr; - let Inst{23} = ?; // add = (U == 1) - let Inst{22} = 0; - let Inst{21} = write; - let Inst{20} = 1; - let Inst{19-16} = 0b1111; // Rn = 0b1111 - let Inst{15-12} = 0b1111; + bits<13> addr; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{7-0} = addr{7-0}; // imm8 } - def r : T2I<(outs), (ins GPR:$base, GPR:$a), IIC_iLoadi, opc, - "\t[$base, $a]", []> { + def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc, + "\t$addr", + [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]> { let Inst{31-25} = 0b1111100; let Inst{24} = instr; let Inst{23} = 0; // add = TRUE for T1 @@ -1201,133 +1574,174 @@ multiclass T2Ipl<bit instr, bit write, string opc> { let Inst{20} = 1; let Inst{15-12} = 0b1111; let Inst{11-6} = 0000000; - let Inst{5-4} = 0b00; // no shift is applied - } - def s : T2I<(outs), (ins GPR:$base, GPR:$a, i32imm:$shamt), IIC_iLoadi, opc, - "\t[$base, $a, lsl $shamt]", []> { - let Inst{31-25} = 0b1111100; - let Inst{24} = instr; - let Inst{23} = 0; // add = TRUE for T1 - let Inst{22} = 0; - let Inst{21} = write; - let Inst{20} = 1; - let Inst{15-12} = 0b1111; - let Inst{11-6} = 0000000; + bits<10> addr; + let Inst{19-16} = addr{9-6}; // Rn + let Inst{3-0} = addr{5-2}; // Rm + let Inst{5-4} = addr{1-0}; // imm2 } } -defm t2PLD : T2Ipl<0, 0, "pld">; -defm t2PLDW : T2Ipl<0, 1, "pldw">; -defm t2PLI : T2Ipl<1, 0, "pli">; +defm t2PLD : T2Ipl<0, 0, "pld">, Requires<[IsThumb2]>; +defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>; +defm t2PLI : T2Ipl<0, 1, "pli">, Requires<[IsThumb2,HasV7]>; //===----------------------------------------------------------------------===// // Load / store multiple Instructions. // -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { -def t2LDM : T2XI<(outs), (ins addrmode4:$addr, pred:$p, - reglist:$dsts, variable_ops), IIC_iLoadm, - "ldm${addr:submode}${p}${addr:wide}\t$addr, $dsts", []> { - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b00; - let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' - let Inst{22} = 0; - let Inst{21} = 0; // The W bit. - let Inst{20} = 1; // Load -} +multiclass thumb2_ldst_mult<string asm, InstrItinClass itin, + InstrItinClass itin_upd, bit L_bit> { + def IA : + T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "ia${p}.w\t$Rn, $regs"), []> { + bits<4> Rn; + bits<16> regs; -def t2LDM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$dsts, variable_ops), IIC_iLoadm, - "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts", - "$addr.addr = $wb", []> { - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b00; - let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' - let Inst{22} = 0; - let Inst{21} = 1; // The W bit. - let Inst{20} = 1; // Load -} -} // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15-0} = regs; + } + def IA_UPD : + T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, !strconcat(asm, "ia${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; -let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { -def t2STM : T2XI<(outs), (ins addrmode4:$addr, pred:$p, - reglist:$srcs, variable_ops), IIC_iStorem, - "stm${addr:submode}${p}${addr:wide}\t$addr, $srcs", []> { - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b00; - let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' - let Inst{22} = 0; - let Inst{21} = 0; // The W bit. - let Inst{20} = 0; // Store -} + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15-0} = regs; + } + def DB : + T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "db${p}.w\t$Rn, $regs"), []> { + bits<4> Rn; + bits<16> regs; -def t2STM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$srcs, variable_ops), - IIC_iStorem, - "stm${addr:submode}${p}${addr:wide}\t$addr!, $srcs", - "$addr.addr = $wb", []> { - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b00; - let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' - let Inst{22} = 0; - let Inst{21} = 1; // The W bit. - let Inst{20} = 0; // Store + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = 0; + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15-0} = regs; + } + def DB_UPD : + T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, !strconcat(asm, "db${p}.w\t$Rn, $regs"), "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15-0} = regs; + } } -} // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq + +let neverHasSideEffects = 1 in { + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm t2LDM : thumb2_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>; + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm t2STM : thumb2_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>; + +} // neverHasSideEffects + //===----------------------------------------------------------------------===// // Move Instructions. // let neverHasSideEffects = 1 in -def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, - "mov", ".w\t$dst, $src", []> { +def t2MOVr : T2sTwoReg<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVr, + "mov", ".w\t$Rd, $Rm", []> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; - let Inst{20} = ?; // The S bit. let Inst{19-16} = 0b1111; // Rn let Inst{14-12} = 0b000; let Inst{7-4} = 0b0000; } // AddedComplexity to ensure isel tries t2MOVi before t2MOVi16. -let isReMaterializable = 1, isAsCheapAsAMove = 1, AddedComplexity = 1 in -def t2MOVi : T2sI<(outs rGPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, - "mov", ".w\t$dst, $src", - [(set rGPR:$dst, t2_so_imm:$src)]> { +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, + AddedComplexity = 1 in +def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi, + "mov", ".w\t$Rd, $imm", + [(set rGPR:$Rd, t2_so_imm:$imm)]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = 0b0010; - let Inst{20} = ?; // The S bit. let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; } -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def t2MOVi16 : T2I<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVi, - "movw", "\t$dst, $src", - [(set rGPR:$dst, imm0_65535:$src)]> { +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins i32imm_hilo16:$imm), IIC_iMOVi, + "movw", "\t$Rd, $imm", + [(set rGPR:$Rd, imm0_65535:$imm)]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-21} = 0b0010; let Inst{20} = 0; // The S bit. let Inst{15} = 0; + + bits<4> Rd; + bits<16> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = imm{15-12}; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; } -let Constraints = "$src = $dst" in -def t2MOVTi16 : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$imm), IIC_iMOVi, - "movt", "\t$dst, $imm", - [(set rGPR:$dst, +def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd), + (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; + +let Constraints = "$src = $Rd" in { +def t2MOVTi16 : T2I<(outs rGPR:$Rd), + (ins rGPR:$src, i32imm_hilo16:$imm), IIC_iMOVi, + "movt", "\t$Rd, $imm", + [(set rGPR:$Rd, (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-21} = 0b0110; let Inst{20} = 0; // The S bit. let Inst{15} = 0; + + bits<4> Rd; + bits<16> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = imm{15-12}; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; } +def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd), + (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; +} // Constraints + def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>; //===----------------------------------------------------------------------===// @@ -1336,28 +1750,28 @@ def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>; // Sign extenders -defm t2SXTB : T2I_unary_rrot<0b100, "sxtb", +defm t2SXTB : T2I_ext_rrot<0b100, "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; -defm t2SXTH : T2I_unary_rrot<0b000, "sxth", +defm t2SXTH : T2I_ext_rrot<0b000, "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; -defm t2SXTB16 : T2I_unary_rrot_sxtb16<0b010, "sxtb16">; +defm t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">; -defm t2SXTAB : T2I_bin_rrot<0b100, "sxtab", +defm t2SXTAB : T2I_exta_rrot<0b100, "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; -defm t2SXTAH : T2I_bin_rrot<0b000, "sxtah", +defm t2SXTAH : T2I_exta_rrot<0b000, "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; -defm t2SXTAB16 : T2I_bin_rrot_DO<0b010, "sxtab16">; +defm t2SXTAB16 : T2I_exta_rrot_DO<0b010, "sxtab16">; // TODO: SXT(A){B|H}16 - done for disassembly only // Zero extenders let AddedComplexity = 16 in { -defm t2UXTB : T2I_unary_rrot<0b101, "uxtb", +defm t2UXTB : T2I_ext_rrot<0b101, "uxtb", UnOpFrag<(and node:$Src, 0x000000FF)>>; -defm t2UXTH : T2I_unary_rrot<0b001, "uxth", +defm t2UXTH : T2I_ext_rrot<0b001, "uxth", UnOpFrag<(and node:$Src, 0x0000FFFF)>>; -defm t2UXTB16 : T2I_unary_rrot_uxtb16<0b011, "uxtb16", +defm t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; // FIXME: This pattern incorrectly assumes the shl operator is a rotate. @@ -1365,15 +1779,17 @@ defm t2UXTB16 : T2I_unary_rrot_uxtb16<0b011, "uxtb16", // instead so we can include a check for masking back in the upper // eight bits of the source into the lower eight bits of the result. //def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF), -// (t2UXTB16r_rot rGPR:$Src, 24)>, Requires<[HasT2ExtractPack]>; +// (t2UXTB16r_rot rGPR:$Src, 24)>, +// Requires<[HasT2ExtractPack, IsThumb2]>; def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF), - (t2UXTB16r_rot rGPR:$Src, 8)>, Requires<[HasT2ExtractPack]>; + (t2UXTB16r_rot rGPR:$Src, 8)>, + Requires<[HasT2ExtractPack, IsThumb2]>; -defm t2UXTAB : T2I_bin_rrot<0b101, "uxtab", +defm t2UXTAB : T2I_exta_rrot<0b101, "uxtab", BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; -defm t2UXTAH : T2I_bin_rrot<0b001, "uxtah", +defm t2UXTAH : T2I_exta_rrot<0b001, "uxtah", BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; -defm t2UXTAB16 : T2I_bin_rrot_DO<0b011, "uxtab16">; +defm t2UXTAB16 : T2I_exta_rrot_DO<0b011, "uxtab16">; } //===----------------------------------------------------------------------===// @@ -1387,8 +1803,10 @@ defm t2SUB : T2I_bin_ii12rs<0b101, "sub", // ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants. defm t2ADDS : T2I_bin_s_irs <0b1000, "add", + IIC_iALUi, IIC_iALUr, IIC_iALUsi, BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>; defm t2SUBS : T2I_bin_s_irs <0b1101, "sub", + IIC_iALUi, IIC_iALUr, IIC_iALUsi, BinOpFrag<(subc node:$LHS, node:$RHS)>>; defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", @@ -1436,8 +1854,8 @@ def : T2Pat<(adde rGPR:$src, t2_so_imm_not:$imm), // Select Bytes -- for disassembly only -def t2SEL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), NoItinerary, "sel", - "\t$dst, $a, $b", []> { +def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []> { let Inst{31-27} = 0b11111; let Inst{26-24} = 0b010; let Inst{23} = 0b1; @@ -1450,28 +1868,41 @@ def t2SEL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), NoItinerary, "sel", // A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned) // And Miscellaneous operations -- for disassembly only class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc, - list<dag> pat = [/* For disassembly only; pattern left blank */]> - : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), NoItinerary, opc, - "\t$dst, $a, $b", pat> { + list<dag> pat = [/* For disassembly only; pattern left blank */], + dag iops = (ins rGPR:$Rn, rGPR:$Rm), + string asm = "\t$Rd, $Rn, $Rm"> + : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0101; let Inst{22-20} = op22_20; let Inst{15-12} = 0b1111; let Inst{7-4} = op7_4; + + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; } // Saturating add/subtract -- for disassembly only def t2QADD : T2I_pam<0b000, 0b1000, "qadd", - [(set rGPR:$dst, (int_arm_qadd rGPR:$a, rGPR:$b))]>; + [(set rGPR:$Rd, (int_arm_qadd rGPR:$Rn, rGPR:$Rm))], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; def t2QADD16 : T2I_pam<0b001, 0b0001, "qadd16">; def t2QADD8 : T2I_pam<0b000, 0b0001, "qadd8">; def t2QASX : T2I_pam<0b010, 0b0001, "qasx">; -def t2QDADD : T2I_pam<0b000, 0b1001, "qdadd">; -def t2QDSUB : T2I_pam<0b000, 0b1011, "qdsub">; +def t2QDADD : T2I_pam<0b000, 0b1001, "qdadd", [], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; +def t2QDSUB : T2I_pam<0b000, 0b1011, "qdsub", [], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; def t2QSAX : T2I_pam<0b110, 0b0001, "qsax">; def t2QSUB : T2I_pam<0b000, 0b1010, "qsub", - [(set rGPR:$dst, (int_arm_qsub rGPR:$a, rGPR:$b))]>; + [(set rGPR:$Rd, (int_arm_qsub rGPR:$Rn, rGPR:$Rm))], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; def t2QSUB16 : T2I_pam<0b101, 0b0001, "qsub16">; def t2QSUB8 : T2I_pam<0b100, 0b0001, "qsub8">; def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">; @@ -1511,21 +1942,61 @@ def t2UHSAX : T2I_pam<0b110, 0b0110, "uhsax">; def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">; def t2UHSUB8 : T2I_pam<0b100, 0b0110, "uhsub8">; +// Helper class for disassembly only +// A6.3.16 & A6.3.17 +// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions. +class T2ThreeReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2ThreeReg<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b011; + let Inst{23} = long; + let Inst{22-20} = op22_20; + let Inst{7-4} = op7_4; +} + +class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2FourReg<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b011; + let Inst{23} = long; + let Inst{22-20} = op22_20; + let Inst{7-4} = op7_4; +} + // Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only -def t2USAD8 : T2I_mac<0, 0b111, 0b0000, (outs rGPR:$dst), - (ins rGPR:$a, rGPR:$b), - NoItinerary, "usad8", "\t$dst, $a, $b", []> { +def t2USAD8 : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm), + NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []> { let Inst{15-12} = 0b1111; } -def t2USADA8 : T2I_mac<0, 0b111, 0b0000, (outs rGPR:$dst), - (ins rGPR:$a, rGPR:$b, rGPR:$acc), NoItinerary, "usada8", - "\t$dst, $a, $b, $acc", []>; +def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary, + "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>; // Signed/Unsigned saturate -- for disassembly only -def t2SSAT: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a, shift_imm:$sh), - NoItinerary, "ssat", "\t$dst, $bit_pos, $a$sh", +class T2SatI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<5> sat_imm; + bits<7> sh; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{4-0} = sat_imm{4-0}; + let Inst{21} = sh{6}; + let Inst{14-12} = sh{4-2}; + let Inst{7-6} = sh{1-0}; +} + +def t2SSAT: T2SatI< + (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn, shift_imm:$sh), + NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; @@ -1533,8 +2004,9 @@ def t2SSAT: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a, shift_imm:$sh), let Inst{15} = 0; } -def t2SSAT16: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a), NoItinerary, - "ssat16", "\t$dst, $bit_pos, $a", +def t2SSAT16: T2SatI< + (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn), NoItinerary, + "ssat16", "\t$Rd, $sat_imm, $Rn", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; @@ -1545,8 +2017,9 @@ def t2SSAT16: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a), NoItinerary, let Inst{7-6} = 0b00; // imm2 = '00' } -def t2USAT: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a, shift_imm:$sh), - NoItinerary, "usat", "\t$dst, $bit_pos, $a$sh", +def t2USAT: T2SatI< + (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn, shift_imm:$sh), + NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1110; @@ -1554,8 +2027,9 @@ def t2USAT: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a, shift_imm:$sh), let Inst{15} = 0; } -def t2USAT16: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a), NoItinerary, - "usat16", "\t$dst, $bit_pos, $a", +def t2USAT16: T2SatI< + (outs rGPR:$dst), (ins i32imm:$sat_imm, rGPR:$Rn), NoItinerary, + "usat16", "\t$dst, $sat_imm, $Rn", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1110; @@ -1579,23 +2053,23 @@ defm t2ASR : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra node:$LHS, node:$RHS)>>; defm t2ROR : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>; let Uses = [CPSR] in { -def t2MOVrx : T2sI<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi, - "rrx", "\t$dst, $src", - [(set rGPR:$dst, (ARMrrx rGPR:$src))]> { +def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, + "rrx", "\t$Rd, $Rm", + [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; - let Inst{20} = ?; // The S bit. let Inst{19-16} = 0b1111; // Rn let Inst{14-12} = 0b000; let Inst{7-4} = 0b0011; } } -let Defs = [CPSR] in { -def t2MOVsrl_flag : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi, - "lsrs", ".w\t$dst, $src, #1", - [(set rGPR:$dst, (ARMsrl_flag rGPR:$src))]> { +let isCodeGenOnly = 1, Defs = [CPSR] in { +def t2MOVsrl_flag : T2TwoRegShiftImm< + (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, + "lsrs", ".w\t$Rd, $Rm, #1", + [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; @@ -1606,9 +2080,10 @@ def t2MOVsrl_flag : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi, let Inst{14-12} = 0b000; let Inst{7-6} = 0b01; } -def t2MOVsra_flag : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi, - "asrs", ".w\t$dst, $src, #1", - [(set rGPR:$dst, (ARMsra_flag rGPR:$src))]> { +def t2MOVsra_flag : T2TwoRegShiftImm< + (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, + "asrs", ".w\t$Rd, $Rm, #1", + [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; @@ -1626,39 +2101,67 @@ def t2MOVsra_flag : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi, // defm t2AND : T2I_bin_w_irs<0b0000, "and", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, BinOpFrag<(and node:$LHS, node:$RHS)>, 1>; defm t2ORR : T2I_bin_w_irs<0b0010, "orr", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, BinOpFrag<(or node:$LHS, node:$RHS)>, 1>; defm t2EOR : T2I_bin_w_irs<0b0100, "eor", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>; defm t2BIC : T2I_bin_w_irs<0b0001, "bic", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, BinOpFrag<(and node:$LHS, (not node:$RHS))>>; -defm t2ANDS : T2I_bin_s_irs<0b0000, "and", - BinOpFrag<(ARMand node:$LHS, node:$RHS)>, 1>; +class T2BitFI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<5> msb; + bits<5> lsb; + + let Inst{11-8} = Rd; + let Inst{4-0} = msb{4-0}; + let Inst{14-12} = lsb{4-2}; + let Inst{7-6} = lsb{1-0}; +} + +class T2TwoRegBitFI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2BitFI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; -let Constraints = "$src = $dst" in -def t2BFC : T2I<(outs rGPR:$dst), (ins rGPR:$src, bf_inv_mask_imm:$imm), - IIC_iUNAsi, "bfc", "\t$dst, $imm", - [(set rGPR:$dst, (and rGPR:$src, bf_inv_mask_imm:$imm))]> { + let Inst{19-16} = Rn; +} + +let Constraints = "$src = $Rd" in +def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm), + IIC_iUNAsi, "bfc", "\t$Rd, $imm", + [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-20} = 0b10110; let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; + + bits<10> imm; + let msb{4-0} = imm{9-5}; + let lsb{4-0} = imm{4-0}; } -def t2SBFX: T2I<(outs rGPR:$dst), (ins rGPR:$src, imm0_31:$lsb, imm0_31:$width), - IIC_iALUi, "sbfx", "\t$dst, $src, $lsb, $width", []> { +def t2SBFX: T2TwoRegBitFI< + (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm0_31_m1:$msb), + IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-20} = 0b10100; let Inst{15} = 0; } -def t2UBFX: T2I<(outs rGPR:$dst), (ins rGPR:$src, imm0_31:$lsb, imm0_31:$width), - IIC_iALUi, "ubfx", "\t$dst, $src, $lsb, $width", []> { +def t2UBFX: T2TwoRegBitFI< + (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm0_31_m1:$msb), + IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-20} = 0b11100; @@ -1666,24 +2169,50 @@ def t2UBFX: T2I<(outs rGPR:$dst), (ins rGPR:$src, imm0_31:$lsb, imm0_31:$width), } // A8.6.18 BFI - Bitfield insert (Encoding T1) -let Constraints = "$src = $dst" in -def t2BFI : T2I<(outs rGPR:$dst), - (ins rGPR:$src, rGPR:$val, bf_inv_mask_imm:$imm), - IIC_iALUi, "bfi", "\t$dst, $val, $imm", - [(set rGPR:$dst, (ARMbfi rGPR:$src, rGPR:$val, - bf_inv_mask_imm:$imm))]> { - let Inst{31-27} = 0b11110; - let Inst{25} = 1; - let Inst{24-20} = 0b10110; - let Inst{15} = 0; +let Constraints = "$src = $Rd" in { + def t2BFI : T2TwoRegBitFI<(outs rGPR:$Rd), + (ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm), + IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn, + bf_inv_mask_imm:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-20} = 0b10110; + let Inst{15} = 0; + + bits<10> imm; + let msb{4-0} = imm{9-5}; + let lsb{4-0} = imm{4-0}; + } + + // GNU as only supports this form of bfi (w/ 4 arguments) + let isAsmParserOnly = 1 in + def t2BFI4p : T2TwoRegBitFI<(outs rGPR:$Rd), + (ins rGPR:$src, rGPR:$Rn, lsb_pos_imm:$lsbit, + width_imm:$width), + IIC_iBITi, "bfi", "\t$Rd, $Rn, $lsbit, $width", + []> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-20} = 0b10110; + let Inst{15} = 0; + + bits<5> lsbit; + bits<5> width; + let msb{4-0} = width; // Custom encoder => lsb+width-1 + let lsb{4-0} = lsbit; + } } -defm t2ORN : T2I_bin_irs<0b0011, "orn", BinOpFrag<(or node:$LHS, - (not node:$RHS))>, 0, "">; +defm t2ORN : T2I_bin_irs<0b0011, "orn", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(or node:$LHS, (not node:$RHS))>, 0, "">; // Prefer over of t2EORri ra, rb, -1 because mvn has 16-bit version let AddedComplexity = 1 in -defm t2MVN : T2I_un_irs <0b0011, "mvn", UnOpFrag<(not node:$Src)>, 1, 1>; +defm t2MVN : T2I_un_irs <0b0011, "mvn", + IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi, + UnOpFrag<(not node:$Src)>, 1, 1>; let AddedComplexity = 1 in @@ -1702,9 +2231,9 @@ def : T2Pat<(t2_so_imm_not:$src), // Multiply Instructions. // let isCommutable = 1 in -def t2MUL: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, - "mul", "\t$dst, $a, $b", - [(set rGPR:$dst, (mul rGPR:$a, rGPR:$b))]> { +def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, + "mul", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b000; @@ -1712,83 +2241,63 @@ def t2MUL: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, let Inst{7-4} = 0b0000; // Multiply } -def t2MLA: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32, - "mla", "\t$dst, $a, $b, $c", - [(set rGPR:$dst, (add (mul rGPR:$a, rGPR:$b), rGPR:$c))]> { +def t2MLA: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "mla", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm), rGPR:$Ra))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b000; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-4} = 0b0000; // Multiply } -def t2MLS: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32, - "mls", "\t$dst, $a, $b, $c", - [(set rGPR:$dst, (sub rGPR:$c, (mul rGPR:$a, rGPR:$b)))]> { +def t2MLS: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "mls", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn, rGPR:$Rm)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b000; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-4} = 0b0001; // Multiply and Subtract } // Extra precision multiplies with low / high results let neverHasSideEffects = 1 in { let isCommutable = 1 in { -def t2SMULL : T2I<(outs rGPR:$ldst, rGPR:$hdst), - (ins rGPR:$a, rGPR:$b), IIC_iMUL64, - "smull", "\t$ldst, $hdst, $a, $b", []> { - let Inst{31-27} = 0b11111; - let Inst{26-23} = 0b0111; - let Inst{22-20} = 0b000; - let Inst{7-4} = 0b0000; -} - -def t2UMULL : T2I<(outs rGPR:$ldst, rGPR:$hdst), - (ins rGPR:$a, rGPR:$b), IIC_iMUL64, - "umull", "\t$ldst, $hdst, $a, $b", []> { - let Inst{31-27} = 0b11111; - let Inst{26-23} = 0b0111; - let Inst{22-20} = 0b010; - let Inst{7-4} = 0b0000; -} +def t2SMULL : T2MulLong<0b000, 0b0000, + (outs rGPR:$Rd, rGPR:$Ra), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64, + "smull", "\t$Rd, $Ra, $Rn, $Rm", []>; + +def t2UMULL : T2MulLong<0b010, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64, + "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>; } // isCommutable // Multiply + accumulate -def t2SMLAL : T2I<(outs rGPR:$ldst, rGPR:$hdst), - (ins rGPR:$a, rGPR:$b), IIC_iMAC64, - "smlal", "\t$ldst, $hdst, $a, $b", []>{ - let Inst{31-27} = 0b11111; - let Inst{26-23} = 0b0111; - let Inst{22-20} = 0b100; - let Inst{7-4} = 0b0000; -} - -def t2UMLAL : T2I<(outs rGPR:$ldst, rGPR:$hdst), - (ins rGPR:$a, rGPR:$b), IIC_iMAC64, - "umlal", "\t$ldst, $hdst, $a, $b", []>{ - let Inst{31-27} = 0b11111; - let Inst{26-23} = 0b0111; - let Inst{22-20} = 0b110; - let Inst{7-4} = 0b0000; -} - -def t2UMAAL : T2I<(outs rGPR:$ldst, rGPR:$hdst), - (ins rGPR:$a, rGPR:$b), IIC_iMAC64, - "umaal", "\t$ldst, $hdst, $a, $b", []>{ - let Inst{31-27} = 0b11111; - let Inst{26-23} = 0b0111; - let Inst{22-20} = 0b110; - let Inst{7-4} = 0b0110; -} +def t2SMLAL : T2MulLong<0b100, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, + "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>; + +def t2UMLAL : T2MulLong<0b110, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, + "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>; + +def t2UMAAL : T2MulLong<0b110, 0b0110, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, + "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>; } // neverHasSideEffects // Rounding variants of the below included for disassembly only // Most significant word multiply -def t2SMMUL : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, - "smmul", "\t$dst, $a, $b", - [(set rGPR:$dst, (mulhs rGPR:$a, rGPR:$b))]> { +def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, + "smmul", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -1796,8 +2305,8 @@ def t2SMMUL : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) } -def t2SMMULR : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, - "smmulr", "\t$dst, $a, $b", []> { +def t2SMMULR : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, + "smmulr", "\t$Rd, $Rn, $Rm", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -1805,49 +2314,49 @@ def t2SMMULR : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) } -def t2SMMLA : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32, - "smmla", "\t$dst, $a, $b, $c", - [(set rGPR:$dst, (add (mulhs rGPR:$a, rGPR:$b), rGPR:$c))]> { +def t2SMMLA : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmla", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) } -def t2SMMLAR: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32, - "smmlar", "\t$dst, $a, $b, $c", []> { +def t2SMMLAR: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) } -def t2SMMLS: T2I <(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32, - "smmls", "\t$dst, $a, $b, $c", - [(set rGPR:$dst, (sub rGPR:$c, (mulhs rGPR:$a, rGPR:$b)))]> { +def t2SMMLS: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmls", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b110; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) } -def t2SMMLSR:T2I <(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32, - "smmlsr", "\t$dst, $a, $b, $c", []> { +def t2SMMLSR:T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b110; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) } multiclass T2I_smul<string opc, PatFrag opnode> { - def BB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, - !strconcat(opc, "bb"), "\t$dst, $a, $b", - [(set rGPR:$dst, (opnode (sext_inreg rGPR:$a, i16), - (sext_inreg rGPR:$b, i16)))]> { + def BB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), + (sext_inreg rGPR:$Rm, i16)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1856,10 +2365,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> { let Inst{5-4} = 0b00; } - def BT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, - !strconcat(opc, "bt"), "\t$dst, $a, $b", - [(set rGPR:$dst, (opnode (sext_inreg rGPR:$a, i16), - (sra rGPR:$b, (i32 16))))]> { + def BT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), + (sra rGPR:$Rm, (i32 16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1868,10 +2377,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> { let Inst{5-4} = 0b01; } - def TB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, - !strconcat(opc, "tb"), "\t$dst, $a, $b", - [(set rGPR:$dst, (opnode (sra rGPR:$a, (i32 16)), - (sext_inreg rGPR:$b, i16)))]> { + def TB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), + (sext_inreg rGPR:$Rm, i16)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1880,10 +2389,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> { let Inst{5-4} = 0b10; } - def TT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, - !strconcat(opc, "tt"), "\t$dst, $a, $b", - [(set rGPR:$dst, (opnode (sra rGPR:$a, (i32 16)), - (sra rGPR:$b, (i32 16))))]> { + def TT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), + (sra rGPR:$Rm, (i32 16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1892,10 +2401,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> { let Inst{5-4} = 0b11; } - def WB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL16, - !strconcat(opc, "wb"), "\t$dst, $a, $b", - [(set rGPR:$dst, (sra (opnode rGPR:$a, - (sext_inreg rGPR:$b, i16)), (i32 16)))]> { + def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (sra (opnode rGPR:$Rn, + (sext_inreg rGPR:$Rm, i16)), (i32 16)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -1904,10 +2413,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> { let Inst{5-4} = 0b00; } - def WT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL16, - !strconcat(opc, "wt"), "\t$dst, $a, $b", - [(set rGPR:$dst, (sra (opnode rGPR:$a, - (sra rGPR:$b, (i32 16))), (i32 16)))]> { + def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (sra (opnode rGPR:$Rn, + (sra rGPR:$Rm, (i32 16))), (i32 16)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -1919,75 +2428,75 @@ multiclass T2I_smul<string opc, PatFrag opnode> { multiclass T2I_smla<string opc, PatFrag opnode> { - def BB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, - !strconcat(opc, "bb"), "\t$dst, $a, $b, $acc", - [(set rGPR:$dst, (add rGPR:$acc, - (opnode (sext_inreg rGPR:$a, i16), - (sext_inreg rGPR:$b, i16))))]> { + def BB : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, + (opnode (sext_inreg rGPR:$Rn, i16), + (sext_inreg rGPR:$Rm, i16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-6} = 0b00; let Inst{5-4} = 0b00; } - def BT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, - !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc", - [(set rGPR:$dst, (add rGPR:$acc, (opnode (sext_inreg rGPR:$a, i16), - (sra rGPR:$b, (i32 16)))))]> { + def BT : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16), + (sra rGPR:$Rm, (i32 16)))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-6} = 0b00; let Inst{5-4} = 0b01; } - def TB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, - !strconcat(opc, "tb"), "\t$dst, $a, $b, $acc", - [(set rGPR:$dst, (add rGPR:$acc, (opnode (sra rGPR:$a, (i32 16)), - (sext_inreg rGPR:$b, i16))))]> { + def TB : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)), + (sext_inreg rGPR:$Rm, i16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-6} = 0b00; let Inst{5-4} = 0b10; } - def TT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, - !strconcat(opc, "tt"), "\t$dst, $a, $b, $acc", - [(set rGPR:$dst, (add rGPR:$acc, (opnode (sra rGPR:$a, (i32 16)), - (sra rGPR:$b, (i32 16)))))]> { + def TT : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)), + (sra rGPR:$Rm, (i32 16)))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-6} = 0b00; let Inst{5-4} = 0b11; } - def WB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, - !strconcat(opc, "wb"), "\t$dst, $a, $b, $acc", - [(set rGPR:$dst, (add rGPR:$acc, (sra (opnode rGPR:$a, - (sext_inreg rGPR:$b, i16)), (i32 16))))]> { + def WB : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn, + (sext_inreg rGPR:$Rm, i16)), (i32 16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-6} = 0b00; let Inst{5-4} = 0b00; } - def WT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, - !strconcat(opc, "wt"), "\t$dst, $a, $b, $acc", - [(set rGPR:$dst, (add rGPR:$acc, (sra (opnode rGPR:$a, - (sra rGPR:$b, (i32 16))), (i32 16))))]> { + def WT : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn, + (sra rGPR:$Rm, (i32 16))), (i32 16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; - let Inst{15-12} = {?, ?, ?, ?}; // Ra let Inst{7-6} = 0b00; let Inst{5-4} = 0b01; } @@ -1997,62 +2506,68 @@ defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; // Halfword multiple accumulate long: SMLAL<x><y> -- for disassembly only -def t2SMLALBB : T2I_mac<1, 0b100, 0b1000, (outs rGPR:$ldst,rGPR:$hdst), - (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlalbb", "\t$ldst, $hdst, $a, $b", +def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>; -def t2SMLALBT : T2I_mac<1, 0b100, 0b1001, (outs rGPR:$ldst,rGPR:$hdst), - (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlalbt", "\t$ldst, $hdst, $a, $b", +def t2SMLALBT : T2FourReg_mac<1, 0b100, 0b1001, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbt", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>; -def t2SMLALTB : T2I_mac<1, 0b100, 0b1010, (outs rGPR:$ldst,rGPR:$hdst), - (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlaltb", "\t$ldst, $hdst, $a, $b", +def t2SMLALTB : T2FourReg_mac<1, 0b100, 0b1010, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltb", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>; -def t2SMLALTT : T2I_mac<1, 0b100, 0b1011, (outs rGPR:$ldst,rGPR:$hdst), - (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlaltt", "\t$ldst, $hdst, $a, $b", +def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltt", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>; // Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD // These are for disassembly only. -def t2SMUAD: T2I_mac<0, 0b010, 0b0000, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), - IIC_iMAC32, "smuad", "\t$dst, $a, $b", []> { +def t2SMUAD: T2ThreeReg_mac< + 0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []> { let Inst{15-12} = 0b1111; } -def t2SMUADX:T2I_mac<0, 0b010, 0b0001, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), - IIC_iMAC32, "smuadx", "\t$dst, $a, $b", []> { +def t2SMUADX:T2ThreeReg_mac< + 0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smuadx", "\t$Rd, $Rn, $Rm", []> { let Inst{15-12} = 0b1111; } -def t2SMUSD: T2I_mac<0, 0b100, 0b0000, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), - IIC_iMAC32, "smusd", "\t$dst, $a, $b", []> { +def t2SMUSD: T2ThreeReg_mac< + 0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smusd", "\t$Rd, $Rn, $Rm", []> { let Inst{15-12} = 0b1111; } -def t2SMUSDX:T2I_mac<0, 0b100, 0b0001, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), - IIC_iMAC32, "smusdx", "\t$dst, $a, $b", []> { +def t2SMUSDX:T2ThreeReg_mac< + 0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smusdx", "\t$Rd, $Rn, $Rm", []> { let Inst{15-12} = 0b1111; } -def t2SMLAD : T2I_mac<0, 0b010, 0b0000, (outs rGPR:$dst), - (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smlad", - "\t$dst, $a, $b, $acc", []>; -def t2SMLADX : T2I_mac<0, 0b010, 0b0001, (outs rGPR:$dst), - (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smladx", - "\t$dst, $a, $b, $acc", []>; -def t2SMLSD : T2I_mac<0, 0b100, 0b0000, (outs rGPR:$dst), - (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smlsd", - "\t$dst, $a, $b, $acc", []>; -def t2SMLSDX : T2I_mac<0, 0b100, 0b0001, (outs rGPR:$dst), - (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smlsdx", - "\t$dst, $a, $b, $acc", []>; -def t2SMLALD : T2I_mac<1, 0b100, 0b1100, (outs rGPR:$ldst,rGPR:$hdst), - (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlald", - "\t$ldst, $hdst, $a, $b", []>; -def t2SMLALDX : T2I_mac<1, 0b100, 0b1101, (outs rGPR:$ldst,rGPR:$hdst), - (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlaldx", - "\t$ldst, $hdst, $a, $b", []>; -def t2SMLSLD : T2I_mac<1, 0b101, 0b1100, (outs rGPR:$ldst,rGPR:$hdst), - (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlsld", - "\t$ldst, $hdst, $a, $b", []>; -def t2SMLSLDX : T2I_mac<1, 0b101, 0b1101, (outs rGPR:$ldst,rGPR:$hdst), - (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlsldx", - "\t$ldst, $hdst, $a, $b", []>; +def t2SMLAD : T2ThreeReg_mac< + 0, 0b010, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad", + "\t$Rd, $Rn, $Rm, $Ra", []>; +def t2SMLADX : T2FourReg_mac< + 0, 0b010, 0b0001, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smladx", + "\t$Rd, $Rn, $Rm, $Ra", []>; +def t2SMLSD : T2FourReg_mac<0, 0b100, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsd", + "\t$Rd, $Rn, $Rm, $Ra", []>; +def t2SMLSDX : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsdx", + "\t$Rd, $Rn, $Rm, $Ra", []>; +def t2SMLALD : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rm, rGPR:$Rn), IIC_iMAC64, "smlald", + "\t$Ra, $Rd, $Rm, $Rn", []>; +def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlaldx", + "\t$Ra, $Rd, $Rm, $Rn", []>; +def t2SMLSLD : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsld", + "\t$Ra, $Rd, $Rm, $Rn", []>; +def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx", + "\t$Ra, $Rd, $Rm, $Rn", []>; //===----------------------------------------------------------------------===// // Misc. Arithmetic Instructions. @@ -2060,99 +2575,117 @@ def t2SMLSLDX : T2I_mac<1, 0b101, 0b1101, (outs rGPR:$ldst,rGPR:$hdst), class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> - : T2I<oops, iops, itin, opc, asm, pattern> { + : T2ThreeReg<oops, iops, itin, opc, asm, pattern> { let Inst{31-27} = 0b11111; let Inst{26-22} = 0b01010; let Inst{21-20} = op1; let Inst{15-12} = 0b1111; let Inst{7-6} = 0b10; let Inst{5-4} = op2; + let Rn{3-0} = Rm; } -def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, - "clz", "\t$dst, $src", [(set rGPR:$dst, (ctlz rGPR:$src))]>; +def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>; -def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, - "rbit", "\t$dst, $src", - [(set rGPR:$dst, (ARMrbit rGPR:$src))]>; +def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "rbit", "\t$Rd, $Rm", + [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>; -def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, - "rev", ".w\t$dst, $src", [(set rGPR:$dst, (bswap rGPR:$src))]>; +def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>; -def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, - "rev16", ".w\t$dst, $src", - [(set rGPR:$dst, - (or (and (srl rGPR:$src, (i32 8)), 0xFF), - (or (and (shl rGPR:$src, (i32 8)), 0xFF00), - (or (and (srl rGPR:$src, (i32 8)), 0xFF0000), - (and (shl rGPR:$src, (i32 8)), 0xFF000000)))))]>; +def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "rev16", ".w\t$Rd, $Rm", + [(set rGPR:$Rd, + (or (and (srl rGPR:$Rm, (i32 8)), 0xFF), + (or (and (shl rGPR:$Rm, (i32 8)), 0xFF00), + (or (and (srl rGPR:$Rm, (i32 8)), 0xFF0000), + (and (shl rGPR:$Rm, (i32 8)), 0xFF000000)))))]>; -def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, - "revsh", ".w\t$dst, $src", - [(set rGPR:$dst, +def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "revsh", ".w\t$Rd, $Rm", + [(set rGPR:$Rd, (sext_inreg - (or (srl (and rGPR:$src, 0xFF00), (i32 8)), - (shl rGPR:$src, (i32 8))), i16))]>; - -def t2PKHBT : T2I<(outs rGPR:$dst), (ins rGPR:$src1, rGPR:$src2, shift_imm:$sh), - IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2$sh", - [(set rGPR:$dst, (or (and rGPR:$src1, 0xFFFF), - (and (shl rGPR:$src2, lsl_amt:$sh), + (or (srl (and rGPR:$Rm, 0xFF00), (i32 8)), + (shl rGPR:$Rm, (i32 8))), i16))]>; + +def t2PKHBT : T2ThreeReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, shift_imm:$sh), + IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh", + [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF), + (and (shl rGPR:$Rm, lsl_amt:$sh), 0xFFFF0000)))]>, - Requires<[HasT2ExtractPack]> { + Requires<[HasT2ExtractPack, IsThumb2]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-20} = 0b01100; let Inst{5} = 0; // BT form let Inst{4} = 0; + + bits<8> sh; + let Inst{14-12} = sh{7-5}; + let Inst{7-6} = sh{4-3}; } // Alternate cases for PKHBT where identities eliminate some nodes. def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)), (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>, - Requires<[HasT2ExtractPack]>; + Requires<[HasT2ExtractPack, IsThumb2]>; def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)), (t2PKHBT rGPR:$src1, rGPR:$src2, (lsl_shift_imm imm16_31:$sh))>, - Requires<[HasT2ExtractPack]>; + Requires<[HasT2ExtractPack, IsThumb2]>; // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and // will match the pattern below. -def t2PKHTB : T2I<(outs rGPR:$dst), (ins rGPR:$src1, rGPR:$src2, shift_imm:$sh), - IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2$sh", - [(set rGPR:$dst, (or (and rGPR:$src1, 0xFFFF0000), - (and (sra rGPR:$src2, asr_amt:$sh), +def t2PKHTB : T2ThreeReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, shift_imm:$sh), + IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh", + [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000), + (and (sra rGPR:$Rm, asr_amt:$sh), 0xFFFF)))]>, - Requires<[HasT2ExtractPack]> { + Requires<[HasT2ExtractPack, IsThumb2]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-20} = 0b01100; let Inst{5} = 1; // TB form let Inst{4} = 0; + + bits<8> sh; + let Inst{14-12} = sh{7-5}; + let Inst{7-6} = sh{4-3}; } // Alternate cases for PKHTB where identities eliminate some nodes. Note that // a shift amount of 0 is *not legal* here, it is PKHBT instead. def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16_31:$sh)), (t2PKHTB rGPR:$src1, rGPR:$src2, (asr_shift_imm imm16_31:$sh))>, - Requires<[HasT2ExtractPack]>; + Requires<[HasT2ExtractPack, IsThumb2]>; def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)), (t2PKHTB rGPR:$src1, rGPR:$src2, (asr_shift_imm imm1_15:$sh))>, - Requires<[HasT2ExtractPack]>; + Requires<[HasT2ExtractPack, IsThumb2]>; //===----------------------------------------------------------------------===// // Comparison Instructions... // defm t2CMP : T2I_cmp_irs<0b1101, "cmp", + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; -defm t2CMPz : T2I_cmp_irs<0b1101, "cmp", - BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>; + +def : T2Pat<(ARMcmpZ GPR:$lhs, t2_so_imm:$imm), + (t2CMPri GPR:$lhs, t2_so_imm:$imm)>; +def : T2Pat<(ARMcmpZ GPR:$lhs, rGPR:$rhs), + (t2CMPrr GPR:$lhs, rGPR:$rhs)>; +def : T2Pat<(ARMcmpZ GPR:$lhs, t2_so_reg:$rhs), + (t2CMPrs GPR:$lhs, t2_so_reg:$rhs)>; //FIXME: Disable CMN, as CCodes are backwards from compare expectations // Compare-to-zero still works out, just not the relationals //defm t2CMN : T2I_cmp_irs<0b1000, "cmn", // BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; defm t2CMNz : T2I_cmp_irs<0b1000, "cmn", + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>; //def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm), @@ -2162,18 +2695,21 @@ def : T2Pat<(ARMcmpZ GPR:$src, t2_so_imm_neg:$imm), (t2CMNzri GPR:$src, t2_so_imm_neg:$imm)>; defm t2TST : T2I_cmp_irs<0b0000, "tst", - BinOpFrag<(ARMcmpZ (and node:$LHS, node:$RHS), 0)>>; + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi, + BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>; defm t2TEQ : T2I_cmp_irs<0b0100, "teq", - BinOpFrag<(ARMcmpZ (xor node:$LHS, node:$RHS), 0)>>; + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi, + BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>; // Conditional moves // FIXME: should be able to write a pattern for ARMcmov, but can't use // a two-value operand where a dag node expects two operands. :( let neverHasSideEffects = 1 in { -def t2MOVCCr : T2I<(outs rGPR:$dst), (ins rGPR:$false, rGPR:$true), IIC_iCMOVr, - "mov", ".w\t$dst, $true", - [/*(set rGPR:$dst, (ARMcmov rGPR:$false, rGPR:$true, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $dst"> { +def t2MOVCCr : T2TwoReg< + (outs rGPR:$Rd), (ins rGPR:$false, rGPR:$Rm), IIC_iCMOVr, + "mov", ".w\t$Rd, $Rm", + [/*(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd"> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; @@ -2183,10 +2719,11 @@ def t2MOVCCr : T2I<(outs rGPR:$dst), (ins rGPR:$false, rGPR:$true), IIC_iCMOVr, let Inst{7-4} = 0b0000; } -def t2MOVCCi : T2I<(outs rGPR:$dst), (ins rGPR:$false, t2_so_imm:$true), - IIC_iCMOVi, "mov", ".w\t$dst, $true", -[/*(set rGPR:$dst,(ARMcmov rGPR:$false,t2_so_imm:$true, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $dst"> { +let isMoveImm = 1 in +def t2MOVCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm), + IIC_iCMOVi, "mov", ".w\t$Rd, $imm", +[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm:$imm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd"> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = 0b0010; @@ -2195,9 +2732,49 @@ def t2MOVCCi : T2I<(outs rGPR:$dst), (ins rGPR:$false, t2_so_imm:$true), let Inst{15} = 0; } +let isMoveImm = 1 in +def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, i32imm_hilo16:$imm), + IIC_iCMOVi, + "movw", "\t$Rd, $imm", []>, + RegConstraint<"$false = $Rd"> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-21} = 0b0010; + let Inst{20} = 0; // The S bit. + let Inst{15} = 0; + + bits<4> Rd; + bits<16> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = imm{15-12}; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +let isMoveImm = 1 in +def t2MOVCCi32imm : PseudoInst<(outs rGPR:$dst), + (ins rGPR:$false, i32imm:$src, pred:$p), + IIC_iCMOVix2, []>, RegConstraint<"$false = $dst">; + +let isMoveImm = 1 in +def t2MVNCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm), + IIC_iCMOVi, "mvn", ".w\t$Rd, $imm", +[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm_not:$imm, + imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd"> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b0011; + let Inst{20} = 0; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; +} + class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> - : T2I<oops, iops, itin, opc, asm, pattern> { + : T2TwoRegShiftImm<oops, iops, itin, opc, asm, pattern> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; @@ -2205,22 +2782,22 @@ class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, let Inst{19-16} = 0b1111; // Rn let Inst{5-4} = opcod; // Shift type. } -def t2MOVCClsl : T2I_movcc_sh<0b00, (outs rGPR:$dst), - (ins rGPR:$false, rGPR:$true, i32imm:$rhs), - IIC_iCMOVsi, "lsl", ".w\t$dst, $true, $rhs", []>, - RegConstraint<"$false = $dst">; -def t2MOVCClsr : T2I_movcc_sh<0b01, (outs rGPR:$dst), - (ins rGPR:$false, rGPR:$true, i32imm:$rhs), - IIC_iCMOVsi, "lsr", ".w\t$dst, $true, $rhs", []>, - RegConstraint<"$false = $dst">; -def t2MOVCCasr : T2I_movcc_sh<0b10, (outs rGPR:$dst), - (ins rGPR:$false, rGPR:$true, i32imm:$rhs), - IIC_iCMOVsi, "asr", ".w\t$dst, $true, $rhs", []>, - RegConstraint<"$false = $dst">; -def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$dst), - (ins rGPR:$false, rGPR:$true, i32imm:$rhs), - IIC_iCMOVsi, "ror", ".w\t$dst, $true, $rhs", []>, - RegConstraint<"$false = $dst">; +def t2MOVCClsl : T2I_movcc_sh<0b00, (outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, i32imm:$imm), + IIC_iCMOVsi, "lsl", ".w\t$Rd, $Rm, $imm", []>, + RegConstraint<"$false = $Rd">; +def t2MOVCClsr : T2I_movcc_sh<0b01, (outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, i32imm:$imm), + IIC_iCMOVsi, "lsr", ".w\t$Rd, $Rm, $imm", []>, + RegConstraint<"$false = $Rd">; +def t2MOVCCasr : T2I_movcc_sh<0b10, (outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, i32imm:$imm), + IIC_iCMOVsi, "asr", ".w\t$Rd, $Rm, $imm", []>, + RegConstraint<"$false = $Rd">; +def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, i32imm:$imm), + IIC_iCMOVsi, "ror", ".w\t$Rd, $Rm, $imm", []>, + RegConstraint<"$false = $Rd">; } // neverHasSideEffects //===----------------------------------------------------------------------===// @@ -2229,78 +2806,29 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$dst), // memory barriers protect the atomic sequences let hasSideEffects = 1 in { -def t2DMBsy : AInoP<(outs), (ins), ThumbFrm, NoItinerary, "dmb", "", - [(ARMMemBarrier)]>, Requires<[IsThumb, HasDB]> { - let Inst{31-4} = 0xF3BF8F5; - // FIXME: add support for options other than a full system DMB - let Inst{3-0} = 0b1111; -} - -def t2DSBsy : AInoP<(outs), (ins), ThumbFrm, NoItinerary, "dsb", "", - [(ARMSyncBarrier)]>, Requires<[IsThumb, HasDB]> { - let Inst{31-4} = 0xF3BF8F4; - // FIXME: add support for options other than a full system DSB - let Inst{3-0} = 0b1111; -} +def t2DMB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary, + "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>, + Requires<[IsThumb, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf3bf8f5; + let Inst{3-0} = opt; } - -// Helper class for multiclass T2MemB -- for disassembly only -class T2I_memb<string opc, string asm> - : T2I<(outs), (ins), NoItinerary, opc, asm, - [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasV7]> { - let Inst{31-20} = 0xf3b; - let Inst{15-14} = 0b10; - let Inst{12} = 0; } -multiclass T2MemB<bits<4> op7_4, string opc> { - - def st : T2I_memb<opc, "\tst"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b1110; - } - - def ish : T2I_memb<opc, "\tish"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b1011; - } - - def ishst : T2I_memb<opc, "\tishst"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b1010; - } - - def nsh : T2I_memb<opc, "\tnsh"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b0111; - } - - def nshst : T2I_memb<opc, "\tnshst"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b0110; - } - - def osh : T2I_memb<opc, "\tosh"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b0011; - } - - def oshst : T2I_memb<opc, "\toshst"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b0010; - } +def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary, + "dsb", "\t$opt", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsThumb, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf3bf8f4; + let Inst{3-0} = opt; } -// These DMB variants are for disassembly only. -defm t2DMB : T2MemB<0b0101, "dmb">; - -// These DSB variants are for disassembly only. -defm t2DSB : T2MemB<0b0100, "dsb">; - // ISB has only full system option -- for disassembly only -def t2ISBsy : T2I_memb<"isb", ""> { - let Inst{7-4} = 0b0110; +def t2ISB : AInoP<(outs), (ins), ThumbFrm, NoItinerary, "isb", "", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsThumb2, HasV7]> { + let Inst{31-4} = 0xf3bf8f6; let Inst{3-0} = 0b1111; } @@ -2314,6 +2842,11 @@ class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, let Inst{7-6} = 0b01; let Inst{5-4} = opcod; let Inst{3-0} = 0b1111; + + bits<4> Rn; + bits<4> Rt; + let Inst{19-16} = Rn; + let Inst{15-12} = Rt; } class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, InstrItinClass itin, string opc, string asm, string cstr, @@ -2324,60 +2857,88 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, let Inst{11-8} = rt2; let Inst{7-6} = 0b01; let Inst{5-4} = opcod; + + bits<4> Rd; + bits<4> Rn; + bits<4> Rt; + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{15-12} = Rt; } let mayLoad = 1 in { -def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$dest), (ins rGPR:$ptr), AddrModeNone, - Size4Bytes, NoItinerary, "ldrexb", "\t$dest, [$ptr]", +def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone, + Size4Bytes, NoItinerary, "ldrexb", "\t$Rt, [$Rn]", "", []>; -def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$dest), (ins rGPR:$ptr), AddrModeNone, - Size4Bytes, NoItinerary, "ldrexh", "\t$dest, [$ptr]", +def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone, + Size4Bytes, NoItinerary, "ldrexh", "\t$Rt, [$Rn]", "", []>; -def t2LDREX : Thumb2I<(outs rGPR:$dest), (ins rGPR:$ptr), AddrModeNone, +def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone, Size4Bytes, NoItinerary, - "ldrex", "\t$dest, [$ptr]", "", + "ldrex", "\t$Rt, [$Rn]", "", []> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0000101; let Inst{11-8} = 0b1111; let Inst{7-0} = 0b00000000; // imm8 = 0 + + bits<4> Rn; + bits<4> Rt; + let Inst{19-16} = Rn; + let Inst{15-12} = Rt; } -def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$dest, rGPR:$dest2), (ins rGPR:$ptr), +def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins rGPR:$Rn), AddrModeNone, Size4Bytes, NoItinerary, - "ldrexd", "\t$dest, $dest2, [$ptr]", "", - [], {?, ?, ?, ?}>; + "ldrexd", "\t$Rt, $Rt2, [$Rn]", "", + [], {?, ?, ?, ?}> { + bits<4> Rt2; + let Inst{11-8} = Rt2; +} } -let mayStore = 1, Constraints = "@earlyclobber $success" in { -def t2STREXB : T2I_strex<0b00, (outs rGPR:$success), (ins rGPR:$src, rGPR:$ptr), +let mayStore = 1, Constraints = "@earlyclobber $Rd" in { +def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn), AddrModeNone, Size4Bytes, NoItinerary, - "strexb", "\t$success, $src, [$ptr]", "", []>; -def t2STREXH : T2I_strex<0b01, (outs rGPR:$success), (ins rGPR:$src, rGPR:$ptr), + "strexb", "\t$Rd, $Rt, [$Rn]", "", []>; +def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn), AddrModeNone, Size4Bytes, NoItinerary, - "strexh", "\t$success, $src, [$ptr]", "", []>; -def t2STREX : Thumb2I<(outs rGPR:$success), (ins rGPR:$src, rGPR:$ptr), + "strexh", "\t$Rd, $Rt, [$Rn]", "", []>; +def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn), AddrModeNone, Size4Bytes, NoItinerary, - "strex", "\t$success, $src, [$ptr]", "", + "strex", "\t$Rd, $Rt, [$Rn]", "", []> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0000100; let Inst{7-0} = 0b00000000; // imm8 = 0 + + bits<4> Rd; + bits<4> Rn; + bits<4> Rt; + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{15-12} = Rt; } -def t2STREXD : T2I_strex<0b11, (outs rGPR:$success), - (ins rGPR:$src, rGPR:$src2, rGPR:$ptr), +def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd), + (ins rGPR:$Rt, rGPR:$Rt2, rGPR:$Rn), AddrModeNone, Size4Bytes, NoItinerary, - "strexd", "\t$success, $src, $src2, [$ptr]", "", [], - {?, ?, ?, ?}>; + "strexd", "\t$Rd, $Rt, $Rt2, [$Rn]", "", [], + {?, ?, ?, ?}> { + bits<4> Rt2; + let Inst{11-8} = Rt2; +} } // Clear-Exclusive is for disassembly only. -def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", - [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM, HasV7]> { - let Inst{31-20} = 0xf3b; +def t2CLREX : T2XI<(outs), (ins), NoItinerary, "clrex", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsThumb2, HasV7]> { + let Inst{31-16} = 0xf3bf; let Inst{15-14} = 0b10; + let Inst{13} = 0; let Inst{12} = 0; + let Inst{11-8} = 0b1111; let Inst{7-4} = 0b0010; + let Inst{3-0} = 0b1111; } //===----------------------------------------------------------------------===// @@ -2386,7 +2947,7 @@ def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", // __aeabi_read_tp preserves the registers r1-r3. let isCall = 1, - Defs = [R0, R12, LR, CPSR] in { + Defs = [R0, R12, LR, CPSR], Uses = [SP] in { def t2TPsoft : T2XI<(outs), (ins), IIC_Br, "bl\t__aeabi_read_tp", [(set R0, ARMthread_pointer)]> { @@ -2413,32 +2974,18 @@ let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ], hasSideEffects = 1, isBarrier = 1 in { + D31 ], hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in { def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val), - AddrModeNone, SizeSpecial, NoItinerary, - "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" - "adds\t$val, #7\n\t" - "str\t$val, [$src, #4]\n\t" - "movs\tr0, #0\n\t" - "b\t1f\n\t" - "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" - "1:", "", + AddrModeNone, SizeSpecial, NoItinerary, "", "", [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>, Requires<[IsThumb2, HasVFP2]>; } let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], - hasSideEffects = 1, isBarrier = 1 in { + hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in { def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val), - AddrModeNone, SizeSpecial, NoItinerary, - "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" - "adds\t$val, #7\n\t" - "str\t$val, [$src, #4]\n\t" - "movs\tr0, #0\n\t" - "b\t1f\n\t" - "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" - "1:", "", + AddrModeNone, SizeSpecial, NoItinerary, "", "", [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>, Requires<[IsThumb2, NoVFP]>; } @@ -2453,82 +3000,77 @@ let Defs = // operand list. // FIXME: Should pc be an implicit operand like PICADD, etc? let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, - hasExtraDefRegAllocReq = 1 in - def t2LDM_RET : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$dsts, variable_ops), IIC_Br, - "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts", - "$addr.addr = $wb", []> { + hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in +def t2LDMIA_RET: T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, + reglist:$regs, variable_ops), + IIC_iLoad_mBr, + "ldmia${p}.w\t$Rn!, $regs", + "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; + let Inst{31-27} = 0b11101; let Inst{26-25} = 0b00; - let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' - let Inst{22} = 0; - let Inst{21} = 1; // The W bit. - let Inst{20} = 1; // Load + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-0} = regs; } let isBranch = 1, isTerminator = 1, isBarrier = 1 in { let isPredicable = 1 in -def t2B : T2XI<(outs), (ins brtarget:$target), IIC_Br, +def t2B : T2XI<(outs), (ins uncondbrtarget:$target), IIC_Br, "b.w\t$target", [(br bb:$target)]> { let Inst{31-27} = 0b11110; let Inst{15-14} = 0b10; let Inst{12} = 1; + + bits<20> target; + let Inst{26} = target{19}; + let Inst{11} = target{18}; + let Inst{13} = target{17}; + let Inst{21-16} = target{16-11}; + let Inst{10-0} = target{10-0}; } let isNotDuplicable = 1, isIndirectBranch = 1 in { -def t2BR_JT : - T2JTI<(outs), - (ins GPR:$target, GPR:$index, jt2block_operand:$jt, i32imm:$id), - IIC_Br, "mov\tpc, $target$jt", - [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0100100; - let Inst{19-16} = 0b1111; - let Inst{14-12} = 0b000; - let Inst{11-8} = 0b1111; // Rd = pc - let Inst{7-4} = 0b0000; -} +def t2BR_JT : t2PseudoInst<(outs), + (ins GPR:$target, GPR:$index, i32imm:$jt, i32imm:$id), + SizeSpecial, IIC_Br, + [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>; // FIXME: Add a non-pc based case that can be predicated. -def t2TBB : - T2JTI<(outs), - (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id), - IIC_Br, "tbb\t$index$jt", []> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0001101; - let Inst{19-16} = 0b1111; // Rn = pc (table follows this instruction) - let Inst{15-8} = 0b11110000; - let Inst{7-4} = 0b0000; // B form -} - -def t2TBH : - T2JTI<(outs), - (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id), - IIC_Br, "tbh\t$index$jt", []> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0001101; - let Inst{19-16} = 0b1111; // Rn = pc (table follows this instruction) - let Inst{15-8} = 0b11110000; - let Inst{7-4} = 0b0001; // H form -} - -// Generic versions of the above two instructions, for disassembly only - -def t2TBBgen : T2I<(outs), (ins GPR:$a, GPR:$b), IIC_Br, - "tbb", "\t[$a, $b]", []>{ - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0001101; - let Inst{15-8} = 0b11110000; - let Inst{7-4} = 0b0000; // B form -} - -def t2TBHgen : T2I<(outs), (ins GPR:$a, GPR:$b), IIC_Br, - "tbh", "\t[$a, $b, lsl #1]", []> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0001101; - let Inst{15-8} = 0b11110000; - let Inst{7-4} = 0b0001; // H form +def t2TBB_JT : t2PseudoInst<(outs), + (ins GPR:$index, i32imm:$jt, i32imm:$id), + SizeSpecial, IIC_Br, []>; + +def t2TBH_JT : t2PseudoInst<(outs), + (ins GPR:$index, i32imm:$jt, i32imm:$id), + SizeSpecial, IIC_Br, []>; + +def t2TBB : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br, + "tbb", "\t[$Rn, $Rm]", []> { + bits<4> Rn; + bits<4> Rm; + let Inst{31-20} = 0b111010001101; + let Inst{19-16} = Rn; + let Inst{15-5} = 0b11110000000; + let Inst{4} = 0; // B form + let Inst{3-0} = Rm; +} + +def t2TBH : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br, + "tbh", "\t[$Rn, $Rm, lsl #1]", []> { + bits<4> Rn; + bits<4> Rm; + let Inst{31-20} = 0b111010001101; + let Inst{19-16} = Rn; + let Inst{15-5} = 0b11110000000; + let Inst{4} = 1; // H form + let Inst{3-0} = Rm; } } // isNotDuplicable, isIndirectBranch @@ -2543,6 +3085,16 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br, let Inst{31-27} = 0b11110; let Inst{15-14} = 0b10; let Inst{12} = 0; + + bits<4> p; + let Inst{25-22} = p; + + bits<21> target; + let Inst{26} = target{20}; + let Inst{11} = target{19}; + let Inst{13} = target{18}; + let Inst{21-16} = target{17-12}; + let Inst{10-0} = target{11-1}; } @@ -2554,6 +3106,11 @@ def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask), // 16-bit instruction. let Inst{31-16} = 0x0000; let Inst{15-8} = 0b10111111; + + bits<4> cc; + bits<4> mask; + let Inst{7-4} = cc; + let Inst{3-0} = mask; } // Branch and Exchange Jazelle -- for disassembly only @@ -2565,22 +3122,44 @@ def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", let Inst{25-20} = 0b111100; let Inst{15-14} = 0b10; let Inst{12} = 0; + + bits<4> func; + let Inst{19-16} = func; } -// Change Processor State is a system instruction -- for disassembly only. -// The singleton $opt operand contains the following information: -// opt{4-0} = mode from Inst{4-0} -// opt{5} = changemode from Inst{17} -// opt{8-6} = AIF from Inst{8-6} -// opt{10-9} = imod from Inst{19-18} with 0b10 as enable and 0b11 as disable -def t2CPS : T2XI<(outs),(ins cps_opt:$opt), NoItinerary, "cps$opt", - [/* For disassembly only; pattern left blank */]> { +// Change Processor State is a system instruction -- for disassembly and +// parsing only. +// FIXME: Since the asm parser has currently no clean way to handle optional +// operands, create 3 versions of the same instruction. Once there's a clean +// framework to represent optional operands, change this behavior. +class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary, + !strconcat("cps", asm_op), + [/* For disassembly only; pattern left blank */]> { + bits<2> imod; + bits<3> iflags; + bits<5> mode; + bit M; + let Inst{31-27} = 0b11110; - let Inst{26} = 0; + let Inst{26} = 0; let Inst{25-20} = 0b111010; + let Inst{19-16} = 0b1111; let Inst{15-14} = 0b10; - let Inst{12} = 0; -} + let Inst{12} = 0; + let Inst{10-9} = imod; + let Inst{8} = M; + let Inst{7-5} = iflags; + let Inst{4-0} = mode; +} + +let M = 1 in + def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode), + "$imod.w\t$iflags, $mode">; +let mode = 0, M = 0 in + def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags), + "$imod.w\t$iflags">; +let imod = 0, iflags = 0, M = 1 in + def t2CPS1p : t2CPS<(ins i32imm:$mode), "\t$mode">; // A6.3.4 Branches and miscellaneous control // Table A6-14 Change Processor State, and hint instructions @@ -2589,6 +3168,7 @@ class T2I_hint<bits<8> op7_0, string opc, string asm> : T2I<(outs), (ins), NoItinerary, opc, asm, [/* For disassembly only; pattern left blank */]> { let Inst{31-20} = 0xf3a; + let Inst{19-16} = 0b1111; let Inst{15-14} = 0b10; let Inst{12} = 0; let Inst{10-8} = 0b000; @@ -2608,6 +3188,9 @@ def t2DBG : T2I<(outs),(ins i32imm:$opt), NoItinerary, "dbg", "\t$opt", let Inst{12} = 0; let Inst{10-8} = 0b000; let Inst{7-4} = 0b1111; + + bits<4> opt; + let Inst{3-0} = opt; } // Secure Monitor Call is a system instruction -- for disassembly only @@ -2617,83 +3200,86 @@ def t2SMC : T2I<(outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt", let Inst{31-27} = 0b11110; let Inst{26-20} = 0b1111111; let Inst{15-12} = 0b1000; -} -// Store Return State is a system instruction -- for disassembly only -def t2SRSDBW : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp!, $mode", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0000010; // W = 1 + bits<4> opt; + let Inst{19-16} = opt; } -def t2SRSDB : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp, $mode", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0000000; // W = 0 -} +class T2SRS<bits<12> op31_20, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + let Inst{31-20} = op31_20{11-0}; -def t2SRSIAW : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsia","\tsp!, $mode", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0011010; // W = 1 + bits<5> mode; + let Inst{4-0} = mode{4-0}; } -def t2SRSIA : T2I<(outs), (ins i32imm:$mode),NoItinerary,"srsia","\tsp, $mode", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0011000; // W = 0 -} +// Store Return State is a system instruction -- for disassembly only +def t2SRSDBW : T2SRS<0b111010000010, + (outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp!, $mode", + [/* For disassembly only; pattern left blank */]>; +def t2SRSDB : T2SRS<0b111010000000, + (outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp, $mode", + [/* For disassembly only; pattern left blank */]>; +def t2SRSIAW : T2SRS<0b111010011010, + (outs),(ins i32imm:$mode),NoItinerary,"srsia","\tsp!, $mode", + [/* For disassembly only; pattern left blank */]>; +def t2SRSIA : T2SRS<0b111010011000, + (outs), (ins i32imm:$mode),NoItinerary,"srsia","\tsp, $mode", + [/* For disassembly only; pattern left blank */]>; // Return From Exception is a system instruction -- for disassembly only -def t2RFEDBW : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfedb", "\t$base!", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0000011; // W = 1 -} -def t2RFEDB : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfeab", "\t$base", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0000001; // W = 0 -} +class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + let Inst{31-20} = op31_20{11-0}; -def t2RFEIAW : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfeia", "\t$base!", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0011011; // W = 1 + bits<4> Rn; + let Inst{19-16} = Rn; } -def t2RFEIA : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfeia", "\t$base", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11101; - let Inst{26-20} = 0b0011001; // W = 0 -} +def t2RFEDBW : T2RFE<0b111010000011, + (outs), (ins rGPR:$Rn), NoItinerary, "rfedb", "\t$Rn!", + [/* For disassembly only; pattern left blank */]>; +def t2RFEDB : T2RFE<0b111010000001, + (outs), (ins rGPR:$Rn), NoItinerary, "rfeab", "\t$Rn", + [/* For disassembly only; pattern left blank */]>; +def t2RFEIAW : T2RFE<0b111010011011, + (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn!", + [/* For disassembly only; pattern left blank */]>; +def t2RFEIA : T2RFE<0b111010011001, + (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn", + [/* For disassembly only; pattern left blank */]>; //===----------------------------------------------------------------------===// // Non-Instruction Patterns // -// Two piece so_imms. -def : T2Pat<(or rGPR:$LHS, t2_so_imm2part:$RHS), - (t2ORRri (t2ORRri rGPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), - (t2_so_imm2part_2 imm:$RHS))>; -def : T2Pat<(xor rGPR:$LHS, t2_so_imm2part:$RHS), - (t2EORri (t2EORri rGPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), - (t2_so_imm2part_2 imm:$RHS))>; -def : T2Pat<(add rGPR:$LHS, t2_so_imm2part:$RHS), - (t2ADDri (t2ADDri rGPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), - (t2_so_imm2part_2 imm:$RHS))>; -def : T2Pat<(add rGPR:$LHS, t2_so_neg_imm2part:$RHS), - (t2SUBri (t2SUBri rGPR:$LHS, (t2_so_neg_imm2part_1 imm:$RHS)), - (t2_so_neg_imm2part_2 imm:$RHS))>; - // 32-bit immediate using movw + movt. -// This is a single pseudo instruction to make it re-materializable. Remove -// when we can do generalized remat. -let isReMaterializable = 1 in -def t2MOVi32imm : T2Ix2<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVi, - "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}", - [(set rGPR:$dst, (i32 imm:$src))]>; +// This is a single pseudo instruction to make it re-materializable. +// FIXME: Remove this when we can do generalized remat. +let isReMaterializable = 1, isMoveImm = 1 in +def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2, + [(set rGPR:$dst, (i32 imm:$src))]>, + Requires<[IsThumb, HasV6T2]>; + +// Pseudo instruction that combines movw + movt + add pc (if pic). +// It also makes it possible to rematerialize the instructions. +// FIXME: Remove this when we can do generalized remat and when machine licm +// can properly the instructions. +let isReMaterializable = 1 in { +def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2addpc, + [(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>, + Requires<[IsThumb2, UseMovt]>; + +def t2MOV_ga_dyn : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2, + [(set rGPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>, + Requires<[IsThumb2, UseMovt]>; +} // ConstantPool, GlobalAddress, and JumpTable def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>, @@ -2709,10 +3295,9 @@ def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), // be expanded into two instructions late to allow if-conversion and // scheduling. let canFoldAsLoad = 1, isReMaterializable = 1 in -def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), - NoItinerary, - "${:comment} ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", - [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), +def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp), + IIC_iLoadiALU, + [(set rGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), imm:$cp))]>, Requires<[IsThumb2]>; @@ -2720,48 +3305,128 @@ def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), // Move between special register and ARM core register -- for disassembly only // -// Rd = Instr{11-8} -def t2MRS : T2I<(outs rGPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, cpsr", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11110; - let Inst{26} = 0; - let Inst{25-21} = 0b11111; - let Inst{20} = 0; // The R bit. - let Inst{15-14} = 0b10; - let Inst{12} = 0; +class T2SpecialReg<bits<12> op31_20, bits<2> op15_14, bits<1> op12, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + let Inst{31-20} = op31_20{11-0}; + let Inst{15-14} = op15_14{1-0}; + let Inst{12} = op12{0}; } -// Rd = Instr{11-8} -def t2MRSsys : T2I<(outs rGPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, spsr", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11110; - let Inst{26} = 0; - let Inst{25-21} = 0b11111; - let Inst{20} = 1; // The R bit. - let Inst{15-14} = 0b10; - let Inst{12} = 0; +class T2MRS<bits<12> op31_20, bits<2> op15_14, bits<1> op12, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2SpecialReg<op31_20, op15_14, op12, oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + let Inst{11-8} = Rd; + let Inst{19-16} = 0b1111; } -// Rn = Inst{19-16} -def t2MSR : T2I<(outs), (ins rGPR:$src, msr_mask:$mask), NoItinerary, "msr", - "\tcpsr$mask, $src", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11110; - let Inst{26} = 0; - let Inst{25-21} = 0b11100; - let Inst{20} = 0; // The R bit. - let Inst{15-14} = 0b10; - let Inst{12} = 0; +def t2MRS : T2MRS<0b111100111110, 0b10, 0, + (outs rGPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, cpsr", + [/* For disassembly only; pattern left blank */]>; +def t2MRSsys : T2MRS<0b111100111111, 0b10, 0, + (outs rGPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", + [/* For disassembly only; pattern left blank */]>; + +// Move from ARM core register to Special Register +// +// No need to have both system and application versions, the encodings are the +// same and the assembly parser has no way to distinguish between them. The mask +// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains +// the mask with the fields to be accessed in the special register. +def t2MSR : T2SpecialReg<0b111100111000 /* op31-20 */, 0b10 /* op15-14 */, + 0 /* op12 */, (outs), (ins msr_mask:$mask, rGPR:$Rn), + NoItinerary, "msr", "\t$mask, $Rn", + [/* For disassembly only; pattern left blank */]> { + bits<5> mask; + bits<4> Rn; + let Inst{19-16} = Rn; + let Inst{20} = mask{4}; // R Bit + let Inst{13} = 0b0; + let Inst{11-8} = mask{3-0}; } -// Rn = Inst{19-16} -def t2MSRsys : T2I<(outs), (ins rGPR:$src, msr_mask:$mask), NoItinerary, "msr", - "\tspsr$mask, $src", +//===----------------------------------------------------------------------===// +// Move between coprocessor and ARM core register -- for disassembly only +// + +class t2MovRCopro<string opc, bit direction> + : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, + GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), + !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), + [/* For disassembly only; pattern left blank */]> { + let Inst{27-24} = 0b1110; + let Inst{20} = direction; + let Inst{4} = 1; + + bits<4> Rt; + bits<4> cop; + bits<3> opc1; + bits<3> opc2; + bits<4> CRm; + bits<4> CRn; + + let Inst{15-12} = Rt; + let Inst{11-8} = cop; + let Inst{23-21} = opc1; + let Inst{7-5} = opc2; + let Inst{3-0} = CRm; + let Inst{19-16} = CRn; +} + +def t2MCR2 : t2MovRCopro<"mcr2", 0 /* from ARM core register to coprocessor */>; +def t2MRC2 : t2MovRCopro<"mrc2", 1 /* from coprocessor to ARM core register */>; + +class t2MovRRCopro<string opc, bit direction> + : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), + !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), + [/* For disassembly only; pattern left blank */]> { + let Inst{27-24} = 0b1100; + let Inst{23-21} = 0b010; + let Inst{20} = direction; + + bits<4> Rt; + bits<4> Rt2; + bits<4> cop; + bits<4> opc1; + bits<4> CRm; + + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + let Inst{11-8} = cop; + let Inst{7-4} = opc1; + let Inst{3-0} = CRm; +} + +def t2MCRR2 : t2MovRRCopro<"mcrr2", + 0 /* from ARM core register to coprocessor */>; +def t2MRRC2 : t2MovRRCopro<"mrrc2", + 1 /* from coprocessor to ARM core register */>; + +//===----------------------------------------------------------------------===// +// Other Coprocessor Instructions. For disassembly only. +// + +def t2CDP2 : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), + "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11110; - let Inst{26} = 0; - let Inst{25-21} = 0b11100; - let Inst{20} = 1; // The R bit. - let Inst{15-14} = 0b10; - let Inst{12} = 0; + let Inst{27-24} = 0b1110; + + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; } diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td index c29e096..920c5c9 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -1,4 +1,4 @@ -//===- ARMInstrVFP.td - VFP support for ARM -------------------------------===// +//===- ARMInstrVFP.td - VFP support for ARM ----------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -11,30 +11,26 @@ // //===----------------------------------------------------------------------===// -def SDT_FTOI : -SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>; -def SDT_ITOF : -SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>; -def SDT_CMPFP0 : -SDTypeProfile<0, 1, [SDTCisFP<0>]>; -def SDT_VMOVDRR : -SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>, - SDTCisSameAs<1, 2>]>; - -def arm_ftoui : SDNode<"ARMISD::FTOUI", SDT_FTOI>; -def arm_ftosi : SDNode<"ARMISD::FTOSI", SDT_FTOI>; -def arm_sitof : SDNode<"ARMISD::SITOF", SDT_ITOF>; -def arm_uitof : SDNode<"ARMISD::UITOF", SDT_ITOF>; -def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInFlag,SDNPOutFlag]>; -def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutFlag]>; -def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0",SDT_CMPFP0, [SDNPOutFlag]>; -def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; +def SDT_FTOI : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>; +def SDT_ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>; +def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>; +def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>]>; + +def arm_ftoui : SDNode<"ARMISD::FTOUI", SDT_FTOI>; +def arm_ftosi : SDNode<"ARMISD::FTOSI", SDT_FTOI>; +def arm_sitof : SDNode<"ARMISD::SITOF", SDT_ITOF>; +def arm_uitof : SDNode<"ARMISD::UITOF", SDT_ITOF>; +def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>; +def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>; +def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>; +def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; + //===----------------------------------------------------------------------===// // Operand Definitions. // - def vfp_f32imm : Operand<f32>, PatLeaf<(f32 fpimm), [{ return ARM::getVFPf32Imm(N->getValueAPF()) != -1; @@ -55,86 +51,136 @@ def vfp_f64imm : Operand<f64>, // let canFoldAsLoad = 1, isReMaterializable = 1 in { -def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$dst), (ins addrmode5:$addr), - IIC_fpLoad64, "vldr", ".64\t$dst, $addr", - [(set DPR:$dst, (f64 (load addrmode5:$addr)))]>; -def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$dst), (ins addrmode5:$addr), - IIC_fpLoad32, "vldr", ".32\t$dst, $addr", - [(set SPR:$dst, (load addrmode5:$addr))]>; -} // canFoldAsLoad +def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr), + IIC_fpLoad64, "vldr", ".64\t$Dd, $addr", + [(set DPR:$Dd, (f64 (load addrmode5:$addr)))]>; -def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$src, addrmode5:$addr), - IIC_fpStore64, "vstr", ".64\t$src, $addr", - [(store (f64 DPR:$src), addrmode5:$addr)]>; +def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr), + IIC_fpLoad32, "vldr", ".32\t$Sd, $addr", + [(set SPR:$Sd, (load addrmode5:$addr))]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} -def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr), - IIC_fpStore32, "vstr", ".32\t$src, $addr", - [(store SPR:$src, addrmode5:$addr)]>; +} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in' -//===----------------------------------------------------------------------===// -// Load / store multiple Instructions. -// +def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr), + IIC_fpStore64, "vstr", ".64\t$Dd, $addr", + [(store (f64 DPR:$Dd), addrmode5:$addr)]>; -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { -def VLDMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts, - variable_ops), IndexModeNone, IIC_fpLoadm, - "vldm${addr:submode}${p}\t$addr, $dsts", "", []> { - let Inst{20} = 1; +def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr), + IIC_fpStore32, "vstr", ".32\t$Sd, $addr", + [(store SPR:$Sd, addrmode5:$addr)]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } -def VLDMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts, - variable_ops), IndexModeNone, IIC_fpLoadm, - "vldm${addr:submode}${p}\t$addr, $dsts", "", []> { - let Inst{20} = 1; -} +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// -def VLDMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$dsts, variable_ops), - IndexModeUpd, IIC_fpLoadm, - "vldm${addr:submode}${p}\t$addr!, $dsts", - "$addr.addr = $wb", []> { - let Inst{20} = 1; +multiclass vfp_ldst_mult<string asm, bit L_bit, + InstrItinClass itin, InstrItinClass itin_upd> { + // Double Precision + def DIA : + AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), + IndexModeNone, itin, + !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def DIA_UPD : + AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + } + def DDB : + AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), + IndexModeNone, itin, + !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def DDB_UPD : + AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + } + + // Single Precision + def SIA : + AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), + IndexModeNone, itin, + !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. + let D = VFPNeonDomain; + } + def SIA_UPD : + AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. + let D = VFPNeonDomain; + } + def SDB : + AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), + IndexModeNone, itin, + !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. + let D = VFPNeonDomain; + } + def SDB_UPD : + AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. + let D = VFPNeonDomain; + } } -def VLDMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$dsts, variable_ops), - IndexModeUpd, IIC_fpLoadm, - "vldm${addr:submode}${p}\t$addr!, $dsts", - "$addr.addr = $wb", []> { - let Inst{20} = 1; -} -} // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq +let neverHasSideEffects = 1 in { -let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { -def VSTMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs, - variable_ops), IndexModeNone, IIC_fpStorem, - "vstm${addr:submode}${p}\t$addr, $srcs", "", []> { - let Inst{20} = 0; -} +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>; -def VSTMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs, - variable_ops), IndexModeNone, IIC_fpStorem, - "vstm${addr:submode}${p}\t$addr, $srcs", "", []> { - let Inst{20} = 0; -} +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpLoad_m, IIC_fpLoad_mu>; -def VSTMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$srcs, variable_ops), - IndexModeUpd, IIC_fpStorem, - "vstm${addr:submode}${p}\t$addr!, $srcs", - "$addr.addr = $wb", []> { - let Inst{20} = 0; -} +} // neverHasSideEffects -def VSTMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$srcs, variable_ops), - IndexModeUpd, IIC_fpStorem, - "vstm${addr:submode}${p}\t$addr!, $srcs", - "$addr.addr = $wb", []> { - let Inst{20} = 0; -} -} // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq +def : MnemonicAlias<"vldm", "vldmia">; +def : MnemonicAlias<"vstm", "vstmia">; // FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores @@ -142,56 +188,71 @@ def VSTMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, // FP Binary Operations. // -def VADDD : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b), - IIC_fpALU64, "vadd", ".f64\t$dst, $a, $b", - [(set DPR:$dst, (fadd DPR:$a, (f64 DPR:$b)))]>; - -def VADDS : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b), - IIC_fpALU32, "vadd", ".f32\t$dst, $a, $b", - [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>; - -// These are encoded as unary instructions. -let Defs = [FPSCR] in { -def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins DPR:$a, DPR:$b), - IIC_fpCMP64, "vcmpe", ".f64\t$a, $b", - [(arm_cmpfp DPR:$a, (f64 DPR:$b))]>; - -def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins DPR:$a, DPR:$b), - IIC_fpCMP64, "vcmp", ".f64\t$a, $b", - [/* For disassembly only; pattern left blank */]>; - -def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins SPR:$a, SPR:$b), - IIC_fpCMP32, "vcmpe", ".f32\t$a, $b", - [(arm_cmpfp SPR:$a, SPR:$b)]>; - -def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins SPR:$a, SPR:$b), - IIC_fpCMP32, "vcmp", ".f32\t$a, $b", - [/* For disassembly only; pattern left blank */]>; +def VADDD : ADbI<0b11100, 0b11, 0, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>; + +def VADDS : ASbIn<0b11100, 0b11, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } -def VDIVD : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b), - IIC_fpDIV64, "vdiv", ".f64\t$dst, $a, $b", - [(set DPR:$dst, (fdiv DPR:$a, (f64 DPR:$b)))]>; - -def VDIVS : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b), - IIC_fpDIV32, "vdiv", ".f32\t$dst, $a, $b", - [(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>; - -def VMULD : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b), - IIC_fpMUL64, "vmul", ".f64\t$dst, $a, $b", - [(set DPR:$dst, (fmul DPR:$a, (f64 DPR:$b)))]>; - -def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b), - IIC_fpMUL32, "vmul", ".f32\t$dst, $a, $b", - [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>; +def VSUBD : ADbI<0b11100, 0b11, 1, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>; + +def VSUBS : ASbIn<0b11100, 0b11, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} -def VNMULD : ADbI<0b11100, 0b10, 1, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b), - IIC_fpMUL64, "vnmul", ".f64\t$dst, $a, $b", - [(set DPR:$dst, (fneg (fmul DPR:$a, (f64 DPR:$b))))]>; +def VDIVD : ADbI<0b11101, 0b00, 0, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>; + +def VDIVS : ASbI<0b11101, 0b00, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>; + +def VMULD : ADbI<0b11100, 0b10, 0, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>; + +def VMULS : ASbIn<0b11100, 0b10, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} -def VNMULS : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b), - IIC_fpMUL32, "vnmul", ".f32\t$dst, $a, $b", - [(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]>; +def VNMULD : ADbI<0b11100, 0b10, 1, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>; + +def VNMULS : ASbI<0b11100, 0b10, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} // Match reassociated forms only if not sign dependent rounding. def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), @@ -199,53 +260,128 @@ def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), def : Pat<(fmul (fneg SPR:$a), SPR:$b), (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>; +// These are encoded as unary instructions. +let Defs = [FPSCR] in { +def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, + (outs), (ins DPR:$Dd, DPR:$Dm), + IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", + [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>; + +def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, + (outs), (ins SPR:$Sd, SPR:$Sm), + IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm", + [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} -def VSUBD : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b), - IIC_fpALU64, "vsub", ".f64\t$dst, $a, $b", - [(set DPR:$dst, (fsub DPR:$a, (f64 DPR:$b)))]>; +// FIXME: Verify encoding after integrated assembler is working. +def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, + (outs), (ins DPR:$Dd, DPR:$Dm), + IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm", + [/* For disassembly only; pattern left blank */]>; -def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b), - IIC_fpALU32, "vsub", ".f32\t$dst, $a, $b", - [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]>; +def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, + (outs), (ins SPR:$Sd, SPR:$Sm), + IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} +} // Defs = [FPSCR] //===----------------------------------------------------------------------===// // FP Unary Operations. // -def VABSD : ADuI<0b11101, 0b11, 0b0000, 0b11, 0, (outs DPR:$dst), (ins DPR:$a), - IIC_fpUNA64, "vabs", ".f64\t$dst, $a", - [(set DPR:$dst, (fabs (f64 DPR:$a)))]>; - -def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,(outs SPR:$dst), (ins SPR:$a), - IIC_fpUNA32, "vabs", ".f32\t$dst, $a", - [(set SPR:$dst, (fabs SPR:$a))]>; +def VABSD : ADuI<0b11101, 0b11, 0b0000, 0b11, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpUNA64, "vabs", ".f64\t$Dd, $Dm", + [(set DPR:$Dd, (fabs (f64 DPR:$Dm)))]>; + +def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpUNA32, "vabs", ".f32\t$Sd, $Sm", + [(set SPR:$Sd, (fabs SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} let Defs = [FPSCR] in { -def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins DPR:$a), - IIC_fpCMP64, "vcmpe", ".f64\t$a, #0", - [(arm_cmpfp0 (f64 DPR:$a))]>; +def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, + (outs), (ins DPR:$Dd), + IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", + [(arm_cmpfp0 (f64 DPR:$Dd))]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; +} -def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins DPR:$a), - IIC_fpCMP64, "vcmp", ".f64\t$a, #0", - [/* For disassembly only; pattern left blank */]>; +def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, + (outs), (ins SPR:$Sd), + IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0", + [(arm_cmpfp0 SPR:$Sd)]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; -def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins SPR:$a), - IIC_fpCMP32, "vcmpe", ".f32\t$a, #0", - [(arm_cmpfp0 SPR:$a)]>; + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} -def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins SPR:$a), - IIC_fpCMP32, "vcmp", ".f32\t$a, #0", - [/* For disassembly only; pattern left blank */]>; +// FIXME: Verify encoding after integrated assembler is working. +def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, + (outs), (ins DPR:$Dd), + IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0", + [/* For disassembly only; pattern left blank */]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; } -def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$dst), (ins SPR:$a), - IIC_fpCVTDS, "vcvt", ".f64.f32\t$dst, $a", - [(set DPR:$dst, (fextend SPR:$a))]>; +def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, + (outs), (ins SPR:$Sd), + IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0", + [/* For disassembly only; pattern left blank */]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} +} // Defs = [FPSCR] + +def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, + (outs DPR:$Dd), (ins SPR:$Sm), + IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", + [(set DPR:$Dd, (fextend SPR:$Sm))]> { + // Instruction operands. + bits<5> Dd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; +} // Special case encoding: bits 11-8 is 0b1011. -def VCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm, - IIC_fpCVTSD, "vcvt", ".f32.f64\t$dst, $a", - [(set SPR:$dst, (fround DPR:$a))]> { +def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, + IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (fround DPR:$Dm))]> { + // Instruction operands. + bits<5> Sd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + let Inst{27-23} = 0b11101; let Inst{21-16} = 0b110111; let Inst{11-8} = 0b1011; @@ -255,6 +391,7 @@ def VCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm, // Between half-precision and single-precision. For disassembly only. +// FIXME: Verify encoding after integrated assembler is working. def VCVTBSH: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$dst, $a", [/* For disassembly only; pattern left blank */]>; @@ -277,47 +414,94 @@ def VCVTTHS: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$dst, $a", [/* For disassembly only; pattern left blank */]>; -let neverHasSideEffects = 1 in { -def VMOVD: ADuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs DPR:$dst), (ins DPR:$a), - IIC_fpUNA64, "vmov", ".f64\t$dst, $a", []>; - -def VMOVS: ASuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), - IIC_fpUNA32, "vmov", ".f32\t$dst, $a", []>; -} // neverHasSideEffects +def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm", + [(set DPR:$Dd, (fneg (f64 DPR:$Dm)))]>; + +def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm", + [(set SPR:$Sd, (fneg SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} -def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, (outs DPR:$dst), (ins DPR:$a), - IIC_fpUNA64, "vneg", ".f64\t$dst, $a", - [(set DPR:$dst, (fneg (f64 DPR:$a)))]>; +def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", + [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>; -def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,(outs SPR:$dst), (ins SPR:$a), - IIC_fpUNA32, "vneg", ".f32\t$dst, $a", - [(set SPR:$dst, (fneg SPR:$a))]>; +def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", + [(set SPR:$Sd, (fsqrt SPR:$Sm))]>; -def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$dst), (ins DPR:$a), - IIC_fpSQRT64, "vsqrt", ".f64\t$dst, $a", - [(set DPR:$dst, (fsqrt (f64 DPR:$a)))]>; +let neverHasSideEffects = 1 in { +def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>; -def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), - IIC_fpSQRT32, "vsqrt", ".f32\t$dst, $a", - [(set SPR:$dst, (fsqrt SPR:$a))]>; +def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>; +} // neverHasSideEffects //===----------------------------------------------------------------------===// // FP <-> GPR Copies. Int <-> FP Conversions. // -def VMOVRS : AVConv2I<0b11100001, 0b1010, (outs GPR:$dst), (ins SPR:$src), - IIC_fpMOVSI, "vmov", "\t$dst, $src", - [(set GPR:$dst, (bitconvert SPR:$src))]>; +def VMOVRS : AVConv2I<0b11100001, 0b1010, + (outs GPR:$Rt), (ins SPR:$Sn), + IIC_fpMOVSI, "vmov", "\t$Rt, $Sn", + [(set GPR:$Rt, (bitconvert SPR:$Sn))]> { + // Instruction operands. + bits<4> Rt; + bits<5> Sn; + + // Encode instruction operands. + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Rt; + + let Inst{6-5} = 0b00; + let Inst{3-0} = 0b0000; +} -def VMOVSR : AVConv4I<0b11100000, 0b1010, (outs SPR:$dst), (ins GPR:$src), - IIC_fpMOVIS, "vmov", "\t$dst, $src", - [(set SPR:$dst, (bitconvert GPR:$src))]>; +def VMOVSR : AVConv4I<0b11100000, 0b1010, + (outs SPR:$Sn), (ins GPR:$Rt), + IIC_fpMOVIS, "vmov", "\t$Sn, $Rt", + [(set SPR:$Sn, (bitconvert GPR:$Rt))]> { + // Instruction operands. + bits<5> Sn; + bits<4> Rt; + + // Encode instruction operands. + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Rt; + + let Inst{6-5} = 0b00; + let Inst{3-0} = 0b0000; +} let neverHasSideEffects = 1 in { def VMOVRRD : AVConv3I<0b11000101, 0b1011, - (outs GPR:$wb, GPR:$dst2), (ins DPR:$src), - IIC_fpMOVDI, "vmov", "\t$wb, $dst2, $src", + (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm), + IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm", [/* FIXME: Can't write pattern for multiple result instr*/]> { + // Instruction operands. + bits<5> Dm; + bits<4> Rt; + bits<4> Rt2; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + let Inst{7-6} = 0b00; } @@ -333,10 +517,21 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010, // FMDLR: GPR -> SPR def VMOVDRR : AVConv5I<0b11000100, 0b1011, - (outs DPR:$dst), (ins GPR:$src1, GPR:$src2), - IIC_fpMOVID, "vmov", "\t$dst, $src1, $src2", - [(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]> { - let Inst{7-6} = 0b00; + (outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2), + IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2", + [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]> { + // Instruction operands. + bits<5> Dm; + bits<4> Rt; + bits<4> Rt2; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + + let Inst{7-6} = 0b00; } let neverHasSideEffects = 1 in @@ -350,102 +545,183 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010, // FMRDH: SPR -> GPR // FMRDL: SPR -> GPR // FMRRS: SPR -> GPR -// FMRX : SPR system reg -> GPR - +// FMRX: SPR system reg -> GPR // FMSRR: GPR -> SPR +// FMXR: GPR -> VFP system reg + + +// Int -> FP: + +class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Dd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; +} -// FMXR: GPR -> VFP Sstem reg - - -// Int to FP: +class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops,InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} -def VSITOD : AVConv1I<0b11101, 0b11, 0b1000, 0b1011, - (outs DPR:$dst), (ins SPR:$a), - IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a", - [(set DPR:$dst, (f64 (arm_sitof SPR:$a)))]> { +def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, + (outs DPR:$Dd), (ins SPR:$Sm), + IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm", + [(set DPR:$Dd, (f64 (arm_sitof SPR:$Sm)))]> { let Inst{7} = 1; // s32 } -def VSITOS : AVConv1In<0b11101, 0b11, 0b1000, 0b1010, - (outs SPR:$dst),(ins SPR:$a), - IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a", - [(set SPR:$dst, (arm_sitof SPR:$a))]> { +def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, + (outs SPR:$Sd),(ins SPR:$Sm), + IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm", + [(set SPR:$Sd, (arm_sitof SPR:$Sm))]> { let Inst{7} = 1; // s32 + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } -def VUITOD : AVConv1I<0b11101, 0b11, 0b1000, 0b1011, - (outs DPR:$dst), (ins SPR:$a), - IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a", - [(set DPR:$dst, (f64 (arm_uitof SPR:$a)))]> { +def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, + (outs DPR:$Dd), (ins SPR:$Sm), + IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm", + [(set DPR:$Dd, (f64 (arm_uitof SPR:$Sm)))]> { let Inst{7} = 0; // u32 } -def VUITOS : AVConv1In<0b11101, 0b11, 0b1000, 0b1010, - (outs SPR:$dst), (ins SPR:$a), - IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a", - [(set SPR:$dst, (arm_uitof SPR:$a))]> { +def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm", + [(set SPR:$Sd, (arm_uitof SPR:$Sm))]> { let Inst{7} = 0; // u32 + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } -// FP to Int: -// Always set Z bit in the instruction, i.e. "round towards zero" variants. +// FP -> Int: + +class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} + +class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} -def VTOSIZD : AVConv1I<0b11101, 0b11, 0b1101, 0b1011, - (outs SPR:$dst), (ins DPR:$a), - IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a", - [(set SPR:$dst, (arm_ftosi (f64 DPR:$a)))]> { +// Always set Z bit in the instruction, i.e. "round towards zero" variants. +def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (arm_ftosi (f64 DPR:$Dm)))]> { let Inst{7} = 1; // Z bit } -def VTOSIZS : AVConv1In<0b11101, 0b11, 0b1101, 0b1010, - (outs SPR:$dst), (ins SPR:$a), - IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a", - [(set SPR:$dst, (arm_ftosi SPR:$a))]> { +def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm", + [(set SPR:$Sd, (arm_ftosi SPR:$Sm))]> { let Inst{7} = 1; // Z bit + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } -def VTOUIZD : AVConv1I<0b11101, 0b11, 0b1100, 0b1011, - (outs SPR:$dst), (ins DPR:$a), - IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a", - [(set SPR:$dst, (arm_ftoui (f64 DPR:$a)))]> { +def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (arm_ftoui (f64 DPR:$Dm)))]> { let Inst{7} = 1; // Z bit } -def VTOUIZS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010, - (outs SPR:$dst), (ins SPR:$a), - IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a", - [(set SPR:$dst, (arm_ftoui SPR:$a))]> { +def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm", + [(set SPR:$Sd, (arm_ftoui SPR:$Sm))]> { let Inst{7} = 1; // Z bit + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. -// For disassembly only. let Uses = [FPSCR] in { -def VTOSIRD : AVConv1I<0b11101, 0b11, 0b1101, 0b1011, - (outs SPR:$dst), (ins DPR:$a), - IIC_fpCVTDI, "vcvtr", ".s32.f64\t$dst, $a", - [(set SPR:$dst, (int_arm_vcvtr (f64 DPR:$a)))]> { +// FIXME: Verify encoding after integrated assembler is working. +def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>{ let Inst{7} = 0; // Z bit } -def VTOSIRS : AVConv1In<0b11101, 0b11, 0b1101, 0b1010, - (outs SPR:$dst), (ins SPR:$a), - IIC_fpCVTSI, "vcvtr", ".s32.f32\t$dst, $a", - [(set SPR:$dst, (int_arm_vcvtr SPR:$a))]> { +def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvtr", ".s32.f32\t$Sd, $Sm", + [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]> { let Inst{7} = 0; // Z bit } -def VTOUIRD : AVConv1I<0b11101, 0b11, 0b1100, 0b1011, - (outs SPR:$dst), (ins DPR:$a), - IIC_fpCVTDI, "vcvtr", ".u32.f64\t$dst, $a", - [(set SPR:$dst, (int_arm_vcvtru (f64 DPR:$a)))]> { +def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>{ let Inst{7} = 0; // Z bit } -def VTOUIRS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010, - (outs SPR:$dst), (ins SPR:$a), - IIC_fpCVTSI, "vcvtr", ".u32.f32\t$dst, $a", - [(set SPR:$dst, (int_arm_vcvtru SPR:$a))]> { +def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvtr", ".u32.f32\t$Sd, $Sm", + [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> { let Inst{7} = 0; // Z bit } } @@ -457,30 +733,47 @@ def VTOUIRS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010, // S32 (U=0, sx=1) -> SL // U32 (U=1, sx=1) -> UL -let Constraints = "$a = $dst" in { +// FIXME: Marking these as codegen only seems wrong. They are real +// instructions(?) +let Constraints = "$a = $dst", isCodeGenOnly = 1 in { // FP to Fixed-Point: -let isCodeGenOnly = 1 in { def VTOSHS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} def VTOUHS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} def VTOSLS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} def VTOULS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} def VTOSHD : AVConv1XI<0b11101, 0b11, 0b1110, 0b1011, 0, (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), @@ -501,30 +794,44 @@ def VTOULD : AVConv1XI<0b11101, 0b11, 0b1111, 0b1011, 1, (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", [/* For disassembly only; pattern left blank */]>; -} // Fixed-Point to FP: -let isCodeGenOnly = 1 in { def VSHTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} def VUHTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} def VSLTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} def VULTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} def VSHTOD : AVConv1XI<0b11101, 0b11, 0b1010, 0b1011, 0, (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), @@ -545,70 +852,120 @@ def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1, (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", [/* For disassembly only; pattern left blank */]>; -} -} // End of 'let Constraints = "$src = $dst" in' +} // End of 'let Constraints = "$a = $dst", isCodeGenOnly = 1 in' //===----------------------------------------------------------------------===// // FP FMA Operations. // -def VMLAD : ADbI_vmlX<0b11100, 0b00, 0, 0, - (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), - IIC_fpMAC64, "vmla", ".f64\t$dst, $a, $b", - [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), - (f64 DPR:$dstin)))]>, - RegConstraint<"$dstin = $dst">; +def VMLAD : ADbI<0b11100, 0b00, 0, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,UseFPVMLx]>; def VMLAS : ASbIn<0b11100, 0b00, 0, 0, - (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), - IIC_fpMAC32, "vmla", ".f32\t$dst, $a, $b", - [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, - RegConstraint<"$dstin = $dst">; - -def VNMLSD : ADbI_vmlX<0b11100, 0b01, 0, 0, - (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), - IIC_fpMAC64, "vnmls", ".f64\t$dst, $a, $b", - [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), - (f64 DPR:$dstin)))]>, - RegConstraint<"$dstin = $dst">; + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} -def VNMLSS : ASbI<0b11100, 0b01, 0, 0, - (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), - IIC_fpMAC32, "vnmls", ".f32\t$dst, $a, $b", - [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, - RegConstraint<"$dstin = $dst">; - -def VMLSD : ADbI_vmlX<0b11100, 0b00, 1, 0, - (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), - IIC_fpMAC64, "vmls", ".f64\t$dst, $a, $b", - [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), - (f64 DPR:$dstin)))]>, - RegConstraint<"$dstin = $dst">; +def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), + (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), + (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>; + +def VMLSD : ADbI<0b11100, 0b00, 1, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,UseFPVMLx]>; def VMLSS : ASbIn<0b11100, 0b00, 1, 0, - (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), - IIC_fpMAC32, "vmls", ".f32\t$dst, $a, $b", - [(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>, - RegConstraint<"$dstin = $dst">; - -def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))), - (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, Requires<[DontUseNEONForFP]>; -def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)), - (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>; - -def VNMLAD : ADbI_vmlX<0b11100, 0b01, 1, 0, - (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), - IIC_fpMAC64, "vnmla", ".f64\t$dst, $a, $b", - [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), - (f64 DPR:$dstin)))]>, - RegConstraint<"$dstin = $dst">; + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), + (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), + (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; + +def VNMLAD : ADbI<0b11100, 0b01, 1, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,UseFPVMLx]>; def VNMLAS : ASbI<0b11100, 0b01, 1, 0, - (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), - IIC_fpMAC32, "vnmla", ".f32\t$dst, $a, $b", - [(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>, - RegConstraint<"$dstin = $dst">; + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), + (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), + (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; + +def VNMLSD : ADbI<0b11100, 0b01, 0, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,UseFPVMLx]>; + +def VNMLSS : ASbI<0b11100, 0b01, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), + (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), + (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; + //===----------------------------------------------------------------------===// // FP Conditional moves. @@ -616,92 +973,157 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, let neverHasSideEffects = 1 in { def VMOVDcc : ADuI<0b11101, 0b11, 0b0000, 0b01, 0, - (outs DPR:$dst), (ins DPR:$false, DPR:$true), - IIC_fpUNA64, "vmov", ".f64\t$dst, $true", - [/*(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))*/]>, - RegConstraint<"$false = $dst">; + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", + [/*(set DPR:$Dd, (ARMcmov DPR:$Dn, DPR:$Dm, imm:$cc))*/]>, + RegConstraint<"$Dn = $Dd">; def VMOVScc : ASuI<0b11101, 0b11, 0b0000, 0b01, 0, - (outs SPR:$dst), (ins SPR:$false, SPR:$true), - IIC_fpUNA32, "vmov", ".f32\t$dst, $true", - [/*(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))*/]>, - RegConstraint<"$false = $dst">; + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", + [/*(set SPR:$Sd, (ARMcmov SPR:$Sn, SPR:$Sm, imm:$cc))*/]>, + RegConstraint<"$Sn = $Sd">; def VNEGDcc : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, - (outs DPR:$dst), (ins DPR:$false, DPR:$true), - IIC_fpUNA64, "vneg", ".f64\t$dst, $true", - [/*(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))*/]>, - RegConstraint<"$false = $dst">; + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm", + [/*(set DPR:$Dd, (ARMcneg DPR:$Dn, DPR:$Dm, imm:$cc))*/]>, + RegConstraint<"$Dn = $Dd">; def VNEGScc : ASuI<0b11101, 0b11, 0b0001, 0b01, 0, - (outs SPR:$dst), (ins SPR:$false, SPR:$true), - IIC_fpUNA32, "vneg", ".f32\t$dst, $true", - [/*(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))*/]>, - RegConstraint<"$false = $dst">; + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm", + [/*(set SPR:$Sd, (ARMcneg SPR:$Sn, SPR:$Sm, imm:$cc))*/]>, + RegConstraint<"$Sn = $Sd"> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} } // neverHasSideEffects //===----------------------------------------------------------------------===// -// Misc. +// Move from VFP System Register to ARM core register. // -// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags -// to APSR. -let Defs = [CPSR], Uses = [FPSCR] in -def FMSTAT : VFPAI<(outs), (ins), VFPMiscFrm, IIC_fpSTAT, "vmrs", - "\tapsr_nzcv, fpscr", - [(arm_fmstat)]> { +class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm, + list<dag> pattern>: + VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> { + + // Instruction operand. + bits<4> Rt; + let Inst{27-20} = 0b11101111; - let Inst{19-16} = 0b0001; - let Inst{15-12} = 0b1111; + let Inst{19-16} = opc19_16; + let Inst{15-12} = Rt; let Inst{11-8} = 0b1010; let Inst{7} = 0; + let Inst{6-5} = 0b00; let Inst{4} = 1; + let Inst{3-0} = 0b0000; } -// FPSCR <-> GPR (for disassembly only) +// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags +// to APSR. +let Defs = [CPSR], Uses = [FPSCR], Rt = 0b1111 /* apsr_nzcv */ in +def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins), + "vmrs", "\tapsr_nzcv, fpscr", [(arm_fmstat)]>; + +// Application level FPSCR -> GPR let hasSideEffects = 1, Uses = [FPSCR] in -def VMRS : VFPAI<(outs GPR:$dst), (ins), VFPMiscFrm, IIC_fpSTAT, - "vmrs", "\t$dst, fpscr", - [(set GPR:$dst, (int_arm_get_fpscr))]> { - let Inst{27-20} = 0b11101111; - let Inst{19-16} = 0b0001; - let Inst{11-8} = 0b1010; - let Inst{7} = 0; - let Inst{4} = 1; +def VMRS : MovFromVFP<0b0001 /* fpscr */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, fpscr", + [(set GPR:$Rt, (int_arm_get_fpscr))]>; + +// System level FPEXC, FPSID -> GPR +let Uses = [FPSCR] in { + def VMRS_FPEXC : MovFromVFP<0b1000 /* fpexc */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, fpexc", []>; + def VMRS_FPSID : MovFromVFP<0b0000 /* fpsid */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, fpsid", []>; } -let Defs = [FPSCR] in -def VMSR : VFPAI<(outs), (ins GPR:$src), VFPMiscFrm, IIC_fpSTAT, - "vmsr", "\tfpscr, $src", - [(int_arm_set_fpscr GPR:$src)]> { +//===----------------------------------------------------------------------===// +// Move from ARM core register to VFP System Register. +// + +class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm, + list<dag> pattern>: + VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> { + + // Instruction operand. + bits<4> src; + + // Encode instruction operand. + let Inst{15-12} = src; + let Inst{27-20} = 0b11101110; - let Inst{19-16} = 0b0001; + let Inst{19-16} = opc19_16; let Inst{11-8} = 0b1010; let Inst{7} = 0; let Inst{4} = 1; } +let Defs = [FPSCR] in { + // Application level GPR -> FPSCR + def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPR:$src), + "vmsr", "\tfpscr, $src", [(int_arm_set_fpscr GPR:$src)]>; + // System level GPR -> FPEXC + def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPR:$src), + "vmsr", "\tfpexc, $src", []>; + // System level GPR -> FPSID + def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPR:$src), + "vmsr", "\tfpsid, $src", []>; +} + +//===----------------------------------------------------------------------===// +// Misc. +// + // Materialize FP immediates. VFP3 only. let isReMaterializable = 1 in { -def FCONSTD : VFPAI<(outs DPR:$dst), (ins vfp_f64imm:$imm), +def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm), VFPMiscFrm, IIC_fpUNA64, - "vmov", ".f64\t$dst, $imm", - [(set DPR:$dst, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> { + "vmov", ".f64\t$Dd, $imm", + [(set DPR:$Dd, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> { + // Instruction operands. + bits<5> Dd; + bits<32> imm; + + // Encode instruction operands. + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + let Inst{19} = imm{31}; + let Inst{18-16} = imm{22-20}; + let Inst{3-0} = imm{19-16}; + + // Encode remaining instruction bits. let Inst{27-23} = 0b11101; let Inst{21-20} = 0b11; let Inst{11-9} = 0b101; - let Inst{8} = 1; + let Inst{8} = 1; // Double precision. let Inst{7-4} = 0b0000; } -def FCONSTS : VFPAI<(outs SPR:$dst), (ins vfp_f32imm:$imm), - VFPMiscFrm, IIC_fpUNA32, - "vmov", ".f32\t$dst, $imm", - [(set SPR:$dst, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> { +def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm), + VFPMiscFrm, IIC_fpUNA32, + "vmov", ".f32\t$Sd, $imm", + [(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> { + // Instruction operands. + bits<5> Sd; + bits<32> imm; + + // Encode instruction operands. + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + let Inst{19} = imm{31}; // The immediate is handled as a double. + let Inst{18-16} = imm{22-20}; + let Inst{3-0} = imm{19-16}; + + // Encode remaining instruction bits. let Inst{27-23} = 0b11101; let Inst{21-20} = 0b11; let Inst{11-9} = 0b101; - let Inst{8} = 0; + let Inst{8} = 0; // Single precision. let Inst{7-4} = 0b0000; } } diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp index 5f6d7ee..45b7e48 100644 --- a/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp @@ -22,7 +22,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/System/Memory.h" +#include "llvm/Support/Memory.h" #include <cstdlib> using namespace llvm; @@ -43,7 +43,7 @@ static TargetJITInfo::JITCompilerFn JITCompilerFunction; #define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__) // CompilationCallback stub - We can't use a C function with inline assembly in -// it, because we the prolog/epilog inserted by GCC won't work for us (we need +// it, because the prolog/epilog inserted by GCC won't work for us. (We need // to preserve more context and manipulate the stack directly). Instead, // write our own wrapper, which does things our way, so we have complete // control over register saving and restoring. @@ -97,9 +97,10 @@ extern "C" { "str r0, [sp,#16]\n" // Return to the (newly modified) stub to invoke the real function. // The above twiddling of the saved return addresses allows us to - // deallocate everything, including the LR the stub saved, all in one - // pop instruction. - "ldmia sp!, {r0, r1, r2, r3, lr, pc}\n" + // deallocate everything, including the LR the stub saved, with two + // updating load instructions. + "ldmia sp!, {r0, r1, r2, r3, lr}\n" + "ldr pc, [sp], #4\n" ); #else // Not an ARM host void ARMCompilationCallback() { @@ -290,7 +291,7 @@ void ARMJITInfo::relocate(void *Function, MachineRelocation *MR, *((intptr_t*)RelocPos) |= ResultPtr; // Set register Rn to PC. *((intptr_t*)RelocPos) |= - ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift; + getARMRegisterNumbering(ARM::PC) << ARMII::RegRnShift; break; } case ARM::reloc_arm_pic_jt: diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.h b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h index f5d9eff..2f97928 100644 --- a/contrib/llvm/lib/Target/ARM/ARMJITInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h @@ -105,7 +105,7 @@ namespace llvm { /// model is PIC. void Initialize(const MachineFunction &MF, bool isPIC) { const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - ConstPoolId2AddrMap.resize(AFI->getNumConstPoolEntries()); + ConstPoolId2AddrMap.resize(AFI->getNumPICLabels()); JumpTableId2AddrMap.resize(AFI->getNumJumpTables()); IsPIC = isPIC; } diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 2b7645a..d9dc5cd 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -128,45 +128,153 @@ namespace { char ARMLoadStoreOpt::ID = 0; } -static int getLoadStoreMultipleOpcode(int Opcode) { +static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { switch (Opcode) { - case ARM::LDR: + default: llvm_unreachable("Unhandled opcode!"); + case ARM::LDRi12: ++NumLDMGened; - return ARM::LDM; - case ARM::STR: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::LDMIA; + case ARM_AM::da: return ARM::LDMDA; + case ARM_AM::db: return ARM::LDMDB; + case ARM_AM::ib: return ARM::LDMIB; + } + break; + case ARM::STRi12: ++NumSTMGened; - return ARM::STM; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::STMIA; + case ARM_AM::da: return ARM::STMDA; + case ARM_AM::db: return ARM::STMDB; + case ARM_AM::ib: return ARM::STMIB; + } + break; case ARM::t2LDRi8: case ARM::t2LDRi12: ++NumLDMGened; - return ARM::t2LDM; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2LDMIA; + case ARM_AM::db: return ARM::t2LDMDB; + } + break; case ARM::t2STRi8: case ARM::t2STRi12: ++NumSTMGened; - return ARM::t2STM; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2STMIA; + case ARM_AM::db: return ARM::t2STMDB; + } + break; case ARM::VLDRS: ++NumVLDMGened; - return ARM::VLDMS; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMSIA; + case ARM_AM::db: return ARM::VLDMSDB; + } + break; case ARM::VSTRS: ++NumVSTMGened; - return ARM::VSTMS; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMSIA; + case ARM_AM::db: return ARM::VSTMSDB; + } + break; case ARM::VLDRD: ++NumVLDMGened; - return ARM::VLDMD; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMDIA; + case ARM_AM::db: return ARM::VLDMDDB; + } + break; case ARM::VSTRD: ++NumVSTMGened; - return ARM::VSTMD; - default: llvm_unreachable("Unhandled opcode!"); + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMDIA; + case ARM_AM::db: return ARM::VSTMDDB; + } + break; } + return 0; } +namespace llvm { + namespace ARM_AM { + +AMSubMode getLoadStoreMultipleSubMode(int Opcode) { + switch (Opcode) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::LDMIA_RET: + case ARM::LDMIA: + case ARM::LDMIA_UPD: + case ARM::STMIA: + case ARM::STMIA_UPD: + case ARM::t2LDMIA_RET: + case ARM::t2LDMIA: + case ARM::t2LDMIA_UPD: + case ARM::t2STMIA: + case ARM::t2STMIA_UPD: + case ARM::VLDMSIA: + case ARM::VLDMSIA_UPD: + case ARM::VSTMSIA: + case ARM::VSTMSIA_UPD: + case ARM::VLDMDIA: + case ARM::VLDMDIA_UPD: + case ARM::VSTMDIA: + case ARM::VSTMDIA_UPD: + return ARM_AM::ia; + + case ARM::LDMDA: + case ARM::LDMDA_UPD: + case ARM::STMDA: + case ARM::STMDA_UPD: + return ARM_AM::da; + + case ARM::LDMDB: + case ARM::LDMDB_UPD: + case ARM::STMDB: + case ARM::STMDB_UPD: + case ARM::t2LDMDB: + case ARM::t2LDMDB_UPD: + case ARM::t2STMDB: + case ARM::t2STMDB_UPD: + case ARM::VLDMSDB: + case ARM::VLDMSDB_UPD: + case ARM::VSTMSDB: + case ARM::VSTMSDB_UPD: + case ARM::VLDMDDB: + case ARM::VLDMDDB_UPD: + case ARM::VSTMDDB: + case ARM::VSTMDDB_UPD: + return ARM_AM::db; + + case ARM::LDMIB: + case ARM::LDMIB_UPD: + case ARM::STMIB: + case ARM::STMIB_UPD: + return ARM_AM::ib; + } + + return ARM_AM::bad_am_submode; +} + + } // end namespace ARM_AM +} // end namespace llvm + static bool isT2i32Load(unsigned Opc) { return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8; } static bool isi32Load(unsigned Opc) { - return Opc == ARM::LDR || isT2i32Load(Opc); + return Opc == ARM::LDRi12 || isT2i32Load(Opc); } static bool isT2i32Store(unsigned Opc) { @@ -174,7 +282,7 @@ static bool isT2i32Store(unsigned Opc) { } static bool isi32Store(unsigned Opc) { - return Opc == ARM::STR || isT2i32Store(Opc); + return Opc == ARM::STRi12 || isT2i32Store(Opc); } /// MergeOps - Create and insert a LDM or STM with Base as base register and @@ -245,10 +353,10 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD); - Opcode = getLoadStoreMultipleOpcode(Opcode); + Opcode = getLoadStoreMultipleOpcode(Opcode, Mode); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)) .addReg(Base, getKillRegState(BaseKill)) - .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg); + .addImm(Pred).addReg(PredReg); for (unsigned i = 0; i != NumRegs; ++i) MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef) | getKillRegState(Regs[i].second)); @@ -271,22 +379,14 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, // First calculate which of the registers should be killed by the merged // instruction. const unsigned insertPos = memOps[insertAfter].Position; - - SmallSet<unsigned, 4> UnavailRegs; SmallSet<unsigned, 4> KilledRegs; DenseMap<unsigned, unsigned> Killer; - for (unsigned i = 0; i < memOpsBegin; ++i) { - if (memOps[i].Position < insertPos && memOps[i].isKill) { - unsigned Reg = memOps[i].Reg; - if (memOps[i].Merged) - UnavailRegs.insert(Reg); - else { - KilledRegs.insert(Reg); - Killer[Reg] = i; - } + for (unsigned i = 0, e = memOps.size(); i != e; ++i) { + if (i == memOpsBegin) { + i = memOpsEnd; + if (i == e) + break; } - } - for (unsigned i = memOpsEnd, e = memOps.size(); i != e; ++i) { if (memOps[i].Position < insertPos && memOps[i].isKill) { unsigned Reg = memOps[i].Reg; KilledRegs.insert(Reg); @@ -297,12 +397,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, SmallVector<std::pair<unsigned, bool>, 8> Regs; for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { unsigned Reg = memOps[i].Reg; - if (UnavailRegs.count(Reg)) - // Register is killed before and it's not easy / possible to update the - // kill marker on already merged instructions. Abort. - return; - - // If we are inserting the merged operation after an unmerged operation that + // If we are inserting the merged operation after an operation that // uses the same register, make sure to transfer any kill flag. bool isKill = memOps[i].isKill || KilledRegs.count(Reg); Regs.push_back(std::make_pair(Reg, isKill)); @@ -318,17 +413,24 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, // Merge succeeded, update records. Merges.push_back(prior(Loc)); for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { - // Remove kill flags from any unmerged memops that come before insertPos. + // Remove kill flags from any memops that come before insertPos. if (Regs[i-memOpsBegin].second) { unsigned Reg = Regs[i-memOpsBegin].first; if (KilledRegs.count(Reg)) { unsigned j = Killer[Reg]; - memOps[j].MBBI->getOperand(0).setIsKill(false); + int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true); + assert(Idx >= 0 && "Cannot find killing operand"); + memOps[j].MBBI->getOperand(Idx).setIsKill(false); memOps[j].isKill = false; } + memOps[i].isKill = true; } MBB.erase(memOps[i].MBBI); + // Update this memop to refer to the merged instruction. + // We may need to move kill flags again. memOps[i].Merged = true; + memOps[i].MBBI = Merges.back(); + memOps[i].Position = insertPos; } } @@ -349,7 +451,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, const MachineOperand &PMO = Loc->getOperand(0); unsigned PReg = PMO.getReg(); unsigned PRegNum = PMO.isUndef() ? UINT_MAX - : ARMRegisterInfo::getRegisterNumbering(PReg); + : getARMRegisterNumbering(PReg); unsigned Count = 1; for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { @@ -357,7 +459,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, const MachineOperand &MO = MemOps[i].MBBI->getOperand(0); unsigned Reg = MO.getReg(); unsigned RegNum = MO.isUndef() ? UINT_MAX - : ARMRegisterInfo::getRegisterNumbering(Reg); + : getARMRegisterNumbering(Reg); // Register numbers must be in ascending order. For VFP, the registers // must also be consecutive and there is a limit of 16 double-word // registers per instruction. @@ -440,8 +542,8 @@ static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base, static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { switch (MI->getOpcode()) { default: return 0; - case ARM::LDR: - case ARM::STR: + case ARM::LDRi12: + case ARM::STRi12: case ARM::t2LDRi8: case ARM::t2LDRi12: case ARM::t2STRi8: @@ -452,31 +554,109 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { case ARM::VLDRD: case ARM::VSTRD: return 8; - case ARM::LDM: - case ARM::STM: - case ARM::t2LDM: - case ARM::t2STM: - case ARM::VLDMS: - case ARM::VSTMS: - return (MI->getNumOperands() - 4) * 4; - case ARM::VLDMD: - case ARM::VSTMD: - return (MI->getNumOperands() - 4) * 8; + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + case ARM::t2LDMIA: + case ARM::t2LDMDB: + case ARM::t2STMIA: + case ARM::t2STMDB: + case ARM::VLDMSIA: + case ARM::VLDMSDB: + case ARM::VSTMSIA: + case ARM::VSTMSDB: + return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4; + case ARM::VLDMDIA: + case ARM::VLDMDDB: + case ARM::VSTMDIA: + case ARM::VSTMDDB: + return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8; } } -static unsigned getUpdatingLSMultipleOpcode(unsigned Opc) { +static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, + ARM_AM::AMSubMode Mode) { switch (Opc) { - case ARM::LDM: return ARM::LDM_UPD; - case ARM::STM: return ARM::STM_UPD; - case ARM::t2LDM: return ARM::t2LDM_UPD; - case ARM::t2STM: return ARM::t2STM_UPD; - case ARM::VLDMS: return ARM::VLDMS_UPD; - case ARM::VLDMD: return ARM::VLDMD_UPD; - case ARM::VSTMS: return ARM::VSTMS_UPD; - case ARM::VSTMD: return ARM::VSTMD_UPD; default: llvm_unreachable("Unhandled opcode!"); + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::LDMIA_UPD; + case ARM_AM::ib: return ARM::LDMIB_UPD; + case ARM_AM::da: return ARM::LDMDA_UPD; + case ARM_AM::db: return ARM::LDMDB_UPD; + } + break; + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::STMIA_UPD; + case ARM_AM::ib: return ARM::STMIB_UPD; + case ARM_AM::da: return ARM::STMDA_UPD; + case ARM_AM::db: return ARM::STMDB_UPD; + } + break; + case ARM::t2LDMIA: + case ARM::t2LDMDB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2LDMIA_UPD; + case ARM_AM::db: return ARM::t2LDMDB_UPD; + } + break; + case ARM::t2STMIA: + case ARM::t2STMDB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2STMIA_UPD; + case ARM_AM::db: return ARM::t2STMDB_UPD; + } + break; + case ARM::VLDMSIA: + case ARM::VLDMSDB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMSIA_UPD; + case ARM_AM::db: return ARM::VLDMSDB_UPD; + } + break; + case ARM::VLDMDIA: + case ARM::VLDMDDB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMDIA_UPD; + case ARM_AM::db: return ARM::VLDMDDB_UPD; + } + break; + case ARM::VSTMSIA: + case ARM::VSTMSDB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMSIA_UPD; + case ARM_AM::db: return ARM::VSTMSDB_UPD; + } + break; + case ARM::VSTMDIA: + case ARM::VSTMDDB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMDIA_UPD; + case ARM_AM::db: return ARM::VSTMDDB_UPD; + } + break; } + return 0; } @@ -505,16 +685,14 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, int Opcode = MI->getOpcode(); DebugLoc dl = MI->getDebugLoc(); - bool DoMerge = false; - ARM_AM::AMSubMode Mode = ARM_AM::ia; - // Can't use an updating ld/st if the base register is also a dest // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. - for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) { + for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i) if (MI->getOperand(i).getReg() == Base) return false; - } - Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm()); + + bool DoMerge = false; + ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode); // Try merging with the previous instruction. MachineBasicBlock::iterator BeginMBBI = MBB.begin(); @@ -560,15 +738,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, if (!DoMerge) return false; - unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode); + unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) .addReg(Base, getDefRegState(true)) // WB base register .addReg(Base, getKillRegState(BaseKill)) - .addImm(ARM_AM::getAM4ModeImm(Mode)) .addImm(Pred).addReg(PredReg); + // Transfer the rest of operands. - for (unsigned OpNum = 4, e = MI->getNumOperands(); OpNum != e; ++OpNum) + for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum) MIB.addOperand(MI->getOperand(OpNum)); + // Transfer memoperands. (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); @@ -576,14 +755,21 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, return true; } -static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) { +static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc, + ARM_AM::AddrOpc Mode) { switch (Opc) { - case ARM::LDR: return ARM::LDR_PRE; - case ARM::STR: return ARM::STR_PRE; - case ARM::VLDRS: return ARM::VLDMS_UPD; - case ARM::VLDRD: return ARM::VLDMD_UPD; - case ARM::VSTRS: return ARM::VSTMS_UPD; - case ARM::VSTRD: return ARM::VSTMD_UPD; + case ARM::LDRi12: + return ARM::LDR_PRE; + case ARM::STRi12: + return ARM::STR_PRE; + case ARM::VLDRS: + return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD; + case ARM::VLDRD: + return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD; + case ARM::VSTRS: + return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD; + case ARM::VSTRD: + return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD; case ARM::t2LDRi8: case ARM::t2LDRi12: return ARM::t2LDR_PRE; @@ -595,14 +781,21 @@ static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) { return 0; } -static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) { +static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc, + ARM_AM::AddrOpc Mode) { switch (Opc) { - case ARM::LDR: return ARM::LDR_POST; - case ARM::STR: return ARM::STR_POST; - case ARM::VLDRS: return ARM::VLDMS_UPD; - case ARM::VLDRD: return ARM::VLDMD_UPD; - case ARM::VSTRS: return ARM::VSTMS_UPD; - case ARM::VSTRD: return ARM::VSTMD_UPD; + case ARM::LDRi12: + return ARM::LDR_POST; + case ARM::STRi12: + return ARM::STR_POST; + case ARM::VLDRS: + return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD; + case ARM::VLDRD: + return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD; + case ARM::VSTRS: + return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD; + case ARM::VSTRD: + return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD; case ARM::t2LDRi8: case ARM::t2LDRi12: return ARM::t2LDR_POST; @@ -629,14 +822,12 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, DebugLoc dl = MI->getDebugLoc(); bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS || Opcode == ARM::VSTRD || Opcode == ARM::VSTRS); - bool isAM2 = (Opcode == ARM::LDR || Opcode == ARM::STR); - if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0) - return false; - if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) - return false; - if (isT2i32Load(Opcode) || isT2i32Store(Opcode)) + bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12); + if (isi32Load(Opcode) || isi32Store(Opcode)) if (MI->getOperand(2).getImm() != 0) return false; + if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) + return false; bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD; // Can't do the merge if the destination register is the same as the would-be @@ -666,7 +857,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, DoMerge = true; } if (DoMerge) { - NewOpc = getPreIndexedLoadStoreOpcode(Opcode); + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub); MBB.erase(PrevMBBI); } } @@ -685,7 +876,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, DoMerge = true; } if (DoMerge) { - NewOpc = getPostIndexedLoadStoreOpcode(Opcode); + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub); if (NextMBBI == I) { Advance = true; ++I; @@ -698,12 +889,9 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, return false; unsigned Offset = 0; - if (isAM5) - Offset = ARM_AM::getAM4ModeImm(AddSub == ARM_AM::sub ? - ARM_AM::db : ARM_AM::ia); - else if (isAM2) + if (isAM2) Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); - else + else if (!isAM5) Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; if (isAM5) { @@ -715,7 +903,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) .addReg(Base, getDefRegState(true)) // WB base register .addReg(Base, getKillRegState(isLd ? BaseKill : false)) - .addImm(Offset) .addImm(Pred).addReg(PredReg) .addReg(MO.getReg(), (isLd ? getDefRegState(true) : getKillRegState(MO.isKill()))); @@ -782,15 +969,14 @@ static bool isMemoryOp(const MachineInstr *MI) { int Opcode = MI->getOpcode(); switch (Opcode) { default: break; - case ARM::LDR: - case ARM::STR: - return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0; case ARM::VLDRS: case ARM::VSTRS: return MI->getOperand(1).isReg(); case ARM::VLDRD: case ARM::VSTRD: return MI->getOperand(1).isReg(); + case ARM::LDRi12: + case ARM::STRi12: case ARM::t2LDRi8: case ARM::t2LDRi12: case ARM::t2STRi8: @@ -818,24 +1004,19 @@ void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { static int getMemoryOpOffset(const MachineInstr *MI) { int Opcode = MI->getOpcode(); - bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR; bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD; unsigned NumOperands = MI->getDesc().getNumOperands(); unsigned OffField = MI->getOperand(NumOperands-3).getImm(); if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 || Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 || - Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) + Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 || + Opcode == ARM::LDRi12 || Opcode == ARM::STRi12) return OffField; - int Offset = isAM2 - ? ARM_AM::getAM2Offset(OffField) - : (isAM3 ? ARM_AM::getAM3Offset(OffField) - : ARM_AM::getAM5Offset(OffField) * 4); - if (isAM2) { - if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub) - Offset = -Offset; - } else if (isAM3) { + int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField) + : ARM_AM::getAM5Offset(OffField) * 4; + if (isAM3) { if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub) Offset = -Offset; } else { @@ -847,35 +1028,24 @@ static int getMemoryOpOffset(const MachineInstr *MI) { static void InsertLDR_STR(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - int OffImm, bool isDef, + int Offset, bool isDef, DebugLoc dl, unsigned NewOpc, unsigned Reg, bool RegDeadKill, bool RegUndef, unsigned BaseReg, bool BaseKill, bool BaseUndef, - unsigned OffReg, bool OffKill, bool OffUndef, + bool OffKill, bool OffUndef, ARMCC::CondCodes Pred, unsigned PredReg, const TargetInstrInfo *TII, bool isT2) { - int Offset = OffImm; - if (!isT2) { - if (OffImm < 0) - Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift); - else - Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift); - } if (isDef) { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill)) .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); - if (!isT2) - MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef)); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); } else { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef)) .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); - if (!isT2) - MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef)); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); } } @@ -906,23 +1076,21 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, unsigned BaseReg = BaseOp.getReg(); bool BaseKill = BaseOp.isKill(); bool BaseUndef = BaseOp.isUndef(); - unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg(); bool OffKill = isT2 ? false : MI->getOperand(3).isKill(); bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef(); int OffImm = getMemoryOpOffset(MI); unsigned PredReg = 0; ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); - if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) { + if (OddRegNum > EvenRegNum && OffImm == 0) { // Ascending register numbers and no offset. It's safe to change it to a // ldm or stm. unsigned NewOpc = (isLd) - ? (isT2 ? ARM::t2LDM : ARM::LDM) - : (isT2 ? ARM::t2STM : ARM::STM); + ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA) + : (isT2 ? ARM::t2STMIA : ARM::STMIA); if (isLd) { BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) .addReg(BaseReg, getKillRegState(BaseKill)) - .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)) .addImm(Pred).addReg(PredReg) .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill)) .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)); @@ -930,7 +1098,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, } else { BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) .addReg(BaseReg, getKillRegState(BaseKill)) - .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)) .addImm(Pred).addReg(PredReg) .addReg(EvenReg, getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef)) @@ -941,28 +1108,24 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, NewBBI = llvm::prior(MBBI); } else { // Split into two instructions. - assert((!isT2 || !OffReg) && - "Thumb2 ldrd / strd does not encode offset register!"); unsigned NewOpc = (isLd) - ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR) - : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR); + ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12) + : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); DebugLoc dl = MBBI->getDebugLoc(); // If this is a load and base register is killed, it may have been // re-defed by the load, make sure the first load does not clobber it. if (isLd && (BaseKill || OffKill) && - (TRI->regsOverlap(EvenReg, BaseReg) || - (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) { - assert(!TRI->regsOverlap(OddReg, BaseReg) && - (!OffReg || !TRI->regsOverlap(OddReg, OffReg))); + (TRI->regsOverlap(EvenReg, BaseReg))) { + assert(!TRI->regsOverlap(OddReg, BaseReg)); InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, OddReg, OddDeadKill, false, - BaseReg, false, BaseUndef, OffReg, false, OffUndef, + BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); NewBBI = llvm::prior(MBBI); InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, false, - BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef, + BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, Pred, PredReg, TII, isT2); } else { if (OddReg == EvenReg && EvenDeadKill) { @@ -974,12 +1137,12 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, } InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, EvenUndef, - BaseReg, false, BaseUndef, OffReg, false, OffUndef, + BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); NewBBI = llvm::prior(MBBI); InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, OddReg, OddDeadKill, OddUndef, - BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef, + BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, Pred, PredReg, TII, isT2); } if (isLd) @@ -1158,17 +1321,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { return NumMerges > 0; } -namespace { - struct OffsetCompare { - bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const { - int LOffset = getMemoryOpOffset(LHS); - int ROffset = getMemoryOpOffset(RHS); - assert(LHS == RHS || LOffset != ROffset); - return LOffset > ROffset; - } - }; -} - /// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops /// ("bx lr" and "mov pc, lr") into the preceeding stack restore so it /// directly restore the value of LR into pc. @@ -1182,20 +1334,25 @@ namespace { bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { if (MBB.empty()) return false; - MachineBasicBlock::iterator MBBI = prior(MBB.end()); + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); if (MBBI != MBB.begin() && (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET || MBBI->getOpcode() == ARM::MOVPCLR)) { MachineInstr *PrevMI = prior(MBBI); - if (PrevMI->getOpcode() == ARM::LDM_UPD || - PrevMI->getOpcode() == ARM::t2LDM_UPD) { + unsigned Opcode = PrevMI->getOpcode(); + if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD || + Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD || + Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) { MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1); if (MO.getReg() != ARM::LR) return false; - unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET; + unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET); + assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) || + Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!"); PrevMI->setDesc(TII->get(NewOpc)); MO.setReg(ARM::PC); + PrevMI->copyImplicitOps(&*MBBI); MBB.erase(MBBI); return true; } @@ -1216,7 +1373,8 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { ++MFI) { MachineBasicBlock &MBB = *MFI; Modified |= LoadStoreMultipleOpti(MBB); - Modified |= MergeReturnIntoLDM(MBB); + if (TM.getSubtarget<ARMSubtarget>().hasV5TOps()) + Modified |= MergeReturnIntoLDM(MBB); } delete RS; @@ -1250,7 +1408,7 @@ namespace { bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, unsigned &NewOpc, unsigned &EvenReg, unsigned &OddReg, unsigned &BaseReg, - unsigned &OffReg, int &Offset, + int &Offset, unsigned &PredReg, ARMCC::CondCodes &Pred, bool &isT2); bool RescheduleOps(MachineBasicBlock *MBB, @@ -1292,7 +1450,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, if (I->isDebugValue() || MemOps.count(&*I)) continue; const TargetInstrDesc &TID = I->getDesc(); - if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects()) + if (TID.isCall() || TID.isTerminator() || I->hasUnmodeledSideEffects()) return false; if (isLd && TID.mayStore()) return false; @@ -1330,8 +1488,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, unsigned &NewOpc, unsigned &EvenReg, unsigned &OddReg, unsigned &BaseReg, - unsigned &OffReg, int &Offset, - unsigned &PredReg, + int &Offset, unsigned &PredReg, ARMCC::CondCodes &Pred, bool &isT2) { // Make sure we're allowed to generate LDRD/STRD. @@ -1341,9 +1498,9 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD unsigned Scale = 1; unsigned Opcode = Op0->getOpcode(); - if (Opcode == ARM::LDR) + if (Opcode == ARM::LDRi12) NewOpc = ARM::LDRD; - else if (Opcode == ARM::STR) + else if (Opcode == ARM::STRi12) NewOpc = ARM::STRD; else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) { NewOpc = ARM::t2LDRDi8; @@ -1356,12 +1513,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, } else return false; - // Make sure the offset registers match. - if (!isT2 && - (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg())) - return false; - - // Must sure the base address satisfies i64 ld / st alignment requirement. + // Make sure the base address satisfies i64 ld / st alignment requirement. if (!Op0->hasOneMemOperand() || !(*Op0->memoperands_begin())->getValue() || (*Op0->memoperands_begin())->isVolatile()) @@ -1370,7 +1522,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, unsigned Align = (*Op0->memoperands_begin())->getAlignment(); const Function *Func = MF->getFunction(); unsigned ReqAlign = STI->hasV6Ops() - ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext())) + ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext())) : 8; // Pre-v6 need 8-byte align if (Align < ReqAlign) return false; @@ -1404,13 +1556,22 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, if (EvenReg == OddReg) return false; BaseReg = Op0->getOperand(1).getReg(); - if (!isT2) - OffReg = Op0->getOperand(2).getReg(); Pred = llvm::getInstrPredicate(Op0, PredReg); dl = Op0->getDebugLoc(); return true; } +namespace { + struct OffsetCompare { + bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const { + int LOffset = getMemoryOpOffset(LHS); + int ROffset = getMemoryOpOffset(RHS); + assert(LHS == RHS || LOffset != ROffset); + return LOffset > ROffset; + } + }; +} + bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, SmallVector<MachineInstr*, 4> &Ops, unsigned Base, bool isLd, @@ -1493,14 +1654,14 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, MachineInstr *Op0 = Ops.back(); MachineInstr *Op1 = Ops[Ops.size()-2]; unsigned EvenReg = 0, OddReg = 0; - unsigned BaseReg = 0, OffReg = 0, PredReg = 0; + unsigned BaseReg = 0, PredReg = 0; ARMCC::CondCodes Pred = ARMCC::AL; bool isT2 = false; unsigned NewOpc = 0; int Offset = 0; DebugLoc dl; if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc, - EvenReg, OddReg, BaseReg, OffReg, + EvenReg, OddReg, BaseReg, Offset, PredReg, Pred, isT2)) { Ops.pop_back(); Ops.pop_back(); @@ -1512,8 +1673,11 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, .addReg(EvenReg, RegState::Define) .addReg(OddReg, RegState::Define) .addReg(BaseReg); + // FIXME: We're converting from LDRi12 to an insn that still + // uses addrmode2, so we need an explicit offset reg. It should + // always by reg0 since we're transforming LDRi12s. if (!isT2) - MIB.addReg(OffReg); + MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); ++NumLDRDFormed; } else { @@ -1522,8 +1686,11 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, .addReg(EvenReg) .addReg(OddReg) .addReg(BaseReg); + // FIXME: We're converting from LDRi12 to an insn that still + // uses addrmode2, so we need an explicit offset reg. It should + // always by reg0 since we're transforming STRi12s. if (!isT2) - MIB.addReg(OffReg); + MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); ++NumSTRDFormed; } diff --git a/contrib/llvm/lib/Target/ARM/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/ARMMCCodeEmitter.cpp new file mode 100644 index 0000000..6d7b485 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMMCCodeEmitter.cpp @@ -0,0 +1,1230 @@ +//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARMMCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mccodeemitter" +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMFixupKinds.h" +#include "ARMInstrInfo.h" +#include "ARMMCExpr.h" +#include "ARMSubtarget.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created."); + +namespace { +class ARMMCCodeEmitter : public MCCodeEmitter { + ARMMCCodeEmitter(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT + const TargetMachine &TM; + const TargetInstrInfo &TII; + const ARMSubtarget *Subtarget; + MCContext &Ctx; + +public: + ARMMCCodeEmitter(TargetMachine &tm, MCContext &ctx) + : TM(tm), TII(*TM.getInstrInfo()), + Subtarget(&TM.getSubtarget<ARMSubtarget>()), Ctx(ctx) { + } + + ~ARMMCCodeEmitter() {} + + unsigned getMachineSoImmOpValue(unsigned SoImm) const; + + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + unsigned getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of + /// the specified operand. This is used for operands with :lower16: and + /// :upper16: prefixes. + uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, + unsigned &Reg, unsigned &Imm, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbBLTargetOpValue - Return encoding info for Thumb immediate + /// BL branch target. + uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate + /// BLX branch target. + uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbBRTargetOpValue - Return encoding info for Thumb branch target. + uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target. + uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbCBTargetOpValue - Return encoding info for Thumb branch target. + uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getBranchTargetOpValue - Return encoding info for 24-bit immediate + /// branch target. + uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit + /// immediate Thumb2 direct branch target. + uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate + /// branch target. + uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAdrLabelOpValue - Return encoding info for 12-bit immediate + /// ADR label target. + uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + + /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' + /// operand. + uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand. + uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups)const; + + /// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2' + /// operand. + uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + + /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm' + /// operand as needed by load/store instructions. + uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getLdStmModeOpValue - Return encoding for load/store multiple mode. + uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + ARM_AM::AMSubMode Mode = (ARM_AM::AMSubMode)MI.getOperand(OpIdx).getImm(); + switch (Mode) { + default: assert(0 && "Unknown addressing sub-mode!"); + case ARM_AM::da: return 0; + case ARM_AM::ia: return 1; + case ARM_AM::db: return 2; + case ARM_AM::ib: return 3; + } + } + /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value. + /// + unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const { + switch (ShOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::no_shift: + case ARM_AM::lsl: return 0; + case ARM_AM::lsr: return 1; + case ARM_AM::asr: return 2; + case ARM_AM::ror: + case ARM_AM::rrx: return 3; + } + return 0; + } + + /// getAddrMode2OpValue - Return encoding for addrmode2 operands. + uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands. + uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrMode3OffsetOpValue - Return encoding for am3offset operands. + uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrMode3OpValue - Return encoding for addrmode3 operands. + uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12' + /// operand. + uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrModeISOpValue - Encode the t_addrmode_is# operands. + uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands. + uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand. + uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getCCOutOpValue - Return encoding of the 's' bit. + unsigned getCCOutOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + // The operand is either reg0 or CPSR. The 's' bit is encoded as '0' or + // '1' respectively. + return MI.getOperand(Op).getReg() == ARM::CPSR; + } + + /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value. + unsigned getSOImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + unsigned SoImm = MI.getOperand(Op).getImm(); + int SoImmVal = ARM_AM::getSOImmVal(SoImm); + assert(SoImmVal != -1 && "Not a valid so_imm value!"); + + // Encode rotate_imm. + unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1) + << ARMII::SoRotImmShift; + + // Encode immed_8. + Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal); + return Binary; + } + + /// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value. + unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + unsigned SoImm = MI.getOperand(Op).getImm(); + unsigned Encoded = ARM_AM::getT2SOImmVal(SoImm); + assert(Encoded != ~0U && "Not a Thumb2 so_imm value?"); + return Encoded; + } + + unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getSORegOpValue - Return an encoded so_reg shifted register value. + unsigned getSORegOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned getRotImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + switch (MI.getOperand(Op).getImm()) { + default: assert (0 && "Not a valid rot_imm value!"); + case 0: return 0; + case 8: return 1; + case 16: return 2; + case 24: return 3; + } + } + + unsigned getImmMinusOneOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return MI.getOperand(Op).getImm() - 1; + } + + unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 64 - MI.getOperand(Op).getImm(); + } + + unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned getMsbOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned NEONThumb2DataIPostEncoder(const MCInst &MI, + unsigned EncodedValue) const; + unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI, + unsigned EncodedValue) const; + unsigned NEONThumb2DupPostEncoder(const MCInst &MI, + unsigned EncodedValue) const; + + unsigned VFPThumb2PostEncoder(const MCInst &MI, + unsigned EncodedValue) const; + + void EmitByte(unsigned char C, raw_ostream &OS) const { + OS << (char)C; + } + + void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const { + // Output the constant in little endian byte order. + for (unsigned i = 0; i != Size; ++i) { + EmitByte(Val & 255, OS); + Val >>= 8; + } + } + + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const; +}; + +} // end anonymous namespace + +MCCodeEmitter *llvm::createARMMCCodeEmitter(const Target &, TargetMachine &TM, + MCContext &Ctx) { + return new ARMMCCodeEmitter(TM, Ctx); +} + +/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing +/// instructions, and rewrite them to their Thumb2 form if we are currently in +/// Thumb2 mode. +unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI, + unsigned EncodedValue) const { + if (Subtarget->isThumb2()) { + // NEON Thumb2 data-processsing encodings are very simple: bit 24 is moved + // to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are + // set to 1111. + unsigned Bit24 = EncodedValue & 0x01000000; + unsigned Bit28 = Bit24 << 4; + EncodedValue &= 0xEFFFFFFF; + EncodedValue |= Bit28; + EncodedValue |= 0x0F000000; + } + + return EncodedValue; +} + +/// NEONThumb2LoadStorePostEncoder - Post-process encoded NEON load/store +/// instructions, and rewrite them to their Thumb2 form if we are currently in +/// Thumb2 mode. +unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI, + unsigned EncodedValue) const { + if (Subtarget->isThumb2()) { + EncodedValue &= 0xF0FFFFFF; + EncodedValue |= 0x09000000; + } + + return EncodedValue; +} + +/// NEONThumb2DupPostEncoder - Post-process encoded NEON vdup +/// instructions, and rewrite them to their Thumb2 form if we are currently in +/// Thumb2 mode. +unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI, + unsigned EncodedValue) const { + if (Subtarget->isThumb2()) { + EncodedValue &= 0x00FFFFFF; + EncodedValue |= 0xEE000000; + } + + return EncodedValue; +} + +/// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite +/// them to their Thumb2 form if we are currently in Thumb2 mode. +unsigned ARMMCCodeEmitter:: +VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue) const { + if (Subtarget->isThumb2()) { + EncodedValue &= 0x0FFFFFFF; + EncodedValue |= 0xE0000000; + } + return EncodedValue; +} + +/// getMachineOpValue - Return binary encoding of operand. If the machine +/// operand requires relocation, record the relocation and return zero. +unsigned ARMMCCodeEmitter:: +getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const { + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + unsigned RegNo = getARMRegisterNumbering(Reg); + + // Q registers are encoded as 2x their register number. + switch (Reg) { + default: + return RegNo; + case ARM::Q0: case ARM::Q1: case ARM::Q2: case ARM::Q3: + case ARM::Q4: case ARM::Q5: case ARM::Q6: case ARM::Q7: + case ARM::Q8: case ARM::Q9: case ARM::Q10: case ARM::Q11: + case ARM::Q12: case ARM::Q13: case ARM::Q14: case ARM::Q15: + return 2 * RegNo; + } + } else if (MO.isImm()) { + return static_cast<unsigned>(MO.getImm()); + } else if (MO.isFPImm()) { + return static_cast<unsigned>(APFloat(MO.getFPImm()) + .bitcastToAPInt().getHiBits(32).getLimitedValue()); + } + + llvm_unreachable("Unable to encode MCOperand!"); + return 0; +} + +/// getAddrModeImmOpValue - Return encoding info for 'reg +/- imm' operand. +bool ARMMCCodeEmitter:: +EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg, + unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + + Reg = getARMRegisterNumbering(MO.getReg()); + + int32_t SImm = MO1.getImm(); + bool isAdd = true; + + // Special value for #-0 + if (SImm == INT32_MIN) + SImm = 0; + + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (SImm < 0) { + SImm = -SImm; + isAdd = false; + } + + Imm = SImm; + return isAdd; +} + +/// getBranchTargetOpValue - Helper function to get the branch target operand, +/// which is either an immediate or requires a fixup. +static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + unsigned FixupKind, + SmallVectorImpl<MCFixup> &Fixups) { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) return MO.getImm(); + assert(MO.isExpr() && "Unexpected branch target type!"); + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(FixupKind); + Fixups.push_back(MCFixup::Create(0, Expr, Kind)); + + // All of the information is in the fixup. + return 0; +} + +/// getThumbBLTargetOpValue - Return encoding info for immediate branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl, Fixups); +} + +/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate +/// BLX branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx, Fixups); +} + +/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br, Fixups); +} + +/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc, Fixups); +} + +/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target. +uint32_t ARMMCCodeEmitter:: +getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups); +} + +/// Return true if this branch has a non-always predication +static bool HasConditionalBranch(const MCInst &MI) { + int NumOp = MI.getNumOperands(); + if (NumOp >= 2) { + for (int i = 0; i < NumOp-1; ++i) { + const MCOperand &MCOp1 = MI.getOperand(i); + const MCOperand &MCOp2 = MI.getOperand(i + 1); + if (MCOp1.isImm() && MCOp2.isReg() && + (MCOp2.getReg() == 0 || MCOp2.getReg() == ARM::CPSR)) { + if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL) + return true; + } + } + } + return false; +} + +/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch +/// target. +uint32_t ARMMCCodeEmitter:: +getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // FIXME: This really, really shouldn't use TargetMachine. We don't want + // coupling between MC and TM anywhere we can help it. + if (Subtarget->isThumb2()) + return + ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups); + return getARMBranchTargetOpValue(MI, OpIdx, Fixups); +} + +/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch +/// target. +uint32_t ARMMCCodeEmitter:: +getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + if (HasConditionalBranch(MI)) + return ::getBranchTargetOpValue(MI, OpIdx, + ARM::fixup_arm_condbranch, Fixups); + return ::getBranchTargetOpValue(MI, OpIdx, + ARM::fixup_arm_uncondbranch, Fixups); +} + + + + +/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit +/// immediate branch target. +uint32_t ARMMCCodeEmitter:: +getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + unsigned Val = + ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups); + bool I = (Val & 0x800000); + bool J1 = (Val & 0x400000); + bool J2 = (Val & 0x200000); + if (I ^ J1) + Val &= ~0x400000; + else + Val |= 0x400000; + + if (I ^ J2) + Val &= ~0x200000; + else + Val |= 0x200000; + + return Val; +} + +/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label +/// target. +uint32_t ARMMCCodeEmitter:: +getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!"); + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12, + Fixups); +} + +/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label +/// target. +uint32_t ARMMCCodeEmitter:: +getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!"); + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12, + Fixups); +} + +/// getAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label +/// target. +uint32_t ARMMCCodeEmitter:: +getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!"); + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10, + Fixups); +} + +/// getThumbAddrModeRegRegOpValue - Return encoding info for 'reg + reg' +/// operand. +uint32_t ARMMCCodeEmitter:: +getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &) const { + // [Rn, Rm] + // {5-3} = Rm + // {2-0} = Rn + const MCOperand &MO1 = MI.getOperand(OpIdx); + const MCOperand &MO2 = MI.getOperand(OpIdx + 1); + unsigned Rn = getARMRegisterNumbering(MO1.getReg()); + unsigned Rm = getARMRegisterNumbering(MO2.getReg()); + return (Rm << 3) | Rn; +} + +/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand. +uint32_t ARMMCCodeEmitter:: +getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {17-13} = reg + // {12} = (U)nsigned (add == '1', sub == '0') + // {11-0} = imm12 + unsigned Reg, Imm12; + bool isAdd = true; + // If The first operand isn't a register, we have a label reference. + const MCOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) { + Reg = getARMRegisterNumbering(ARM::PC); // Rn is PC. + Imm12 = 0; + isAdd = false ; // 'U' bit is set as part of the fixup. + + assert(MO.isExpr() && "Unexpected machine operand type!"); + const MCExpr *Expr = MO.getExpr(); + + MCFixupKind Kind; + if (Subtarget->isThumb2()) + Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12); + else + Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12); + Fixups.push_back(MCFixup::Create(0, Expr, Kind)); + + ++MCNumCPRelocations; + } else + isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups); + + uint32_t Binary = Imm12 & 0xfff; + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 12); + Binary |= (Reg << 13); + return Binary; +} + +/// getT2AddrModeImm8s4OpValue - Return encoding info for +/// 'reg +/- imm8<<2' operand. +uint32_t ARMMCCodeEmitter:: +getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {12-9} = reg + // {8} = (U)nsigned (add == '1', sub == '0') + // {7-0} = imm8 + unsigned Reg, Imm8; + bool isAdd = true; + // If The first operand isn't a register, we have a label reference. + const MCOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) { + Reg = getARMRegisterNumbering(ARM::PC); // Rn is PC. + Imm8 = 0; + isAdd = false ; // 'U' bit is set as part of the fixup. + + assert(MO.isExpr() && "Unexpected machine operand type!"); + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_pcrel_10); + Fixups.push_back(MCFixup::Create(0, Expr, Kind)); + + ++MCNumCPRelocations; + } else + isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups); + + uint32_t Binary = (Imm8 >> 2) & 0xff; + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 8); + Binary |= (Reg << 9); + return Binary; +} + +// FIXME: This routine assumes that a binary +// expression will always result in a PCRel expression +// In reality, its only true if one or more subexpressions +// is itself a PCRel (i.e. "." in asm or some other pcrel construct) +// but this is good enough for now. +static bool EvaluateAsPCRel(const MCExpr *Expr) { + switch (Expr->getKind()) { + default: assert(0 && "Unexpected expression type"); + case MCExpr::SymbolRef: return false; + case MCExpr::Binary: return true; + } +} + +uint32_t +ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {20-16} = imm{15-12} + // {11-0} = imm{11-0} + const MCOperand &MO = MI.getOperand(OpIdx); + if (MO.isImm()) + // Hi / lo 16 bits already extracted during earlier passes. + return static_cast<unsigned>(MO.getImm()); + + // Handle :upper16: and :lower16: assembly prefixes. + const MCExpr *E = MO.getExpr(); + if (E->getKind() == MCExpr::Target) { + const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E); + E = ARM16Expr->getSubExpr(); + + MCFixupKind Kind; + switch (ARM16Expr->getKind()) { + default: assert(0 && "Unsupported ARMFixup"); + case ARMMCExpr::VK_ARM_HI16: + if (!Subtarget->isTargetDarwin() && EvaluateAsPCRel(E)) + Kind = MCFixupKind(Subtarget->isThumb2() + ? ARM::fixup_t2_movt_hi16_pcrel + : ARM::fixup_arm_movt_hi16_pcrel); + else + Kind = MCFixupKind(Subtarget->isThumb2() + ? ARM::fixup_t2_movt_hi16 + : ARM::fixup_arm_movt_hi16); + break; + case ARMMCExpr::VK_ARM_LO16: + if (!Subtarget->isTargetDarwin() && EvaluateAsPCRel(E)) + Kind = MCFixupKind(Subtarget->isThumb2() + ? ARM::fixup_t2_movw_lo16_pcrel + : ARM::fixup_arm_movw_lo16_pcrel); + else + Kind = MCFixupKind(Subtarget->isThumb2() + ? ARM::fixup_t2_movw_lo16 + : ARM::fixup_arm_movw_lo16); + break; + } + Fixups.push_back(MCFixup::Create(0, E, Kind)); + return 0; + }; + + llvm_unreachable("Unsupported MCExpr type in MCOperand!"); + return 0; +} + +uint32_t ARMMCCodeEmitter:: +getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + const MCOperand &MO2 = MI.getOperand(OpIdx+2); + unsigned Rn = getARMRegisterNumbering(MO.getReg()); + unsigned Rm = getARMRegisterNumbering(MO1.getReg()); + unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()); + bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add; + ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm()); + unsigned SBits = getShiftOp(ShOp); + + // {16-13} = Rn + // {12} = isAdd + // {11-0} = shifter + // {3-0} = Rm + // {4} = 0 + // {6-5} = type + // {11-7} = imm + uint32_t Binary = Rm; + Binary |= Rn << 13; + Binary |= SBits << 5; + Binary |= ShImm << 7; + if (isAdd) + Binary |= 1 << 12; + return Binary; +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {17-14} Rn + // {13} 1 == imm12, 0 == Rm + // {12} isAdd + // {11-0} imm12/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + unsigned Rn = getARMRegisterNumbering(MO.getReg()); + uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups); + Binary |= Rn << 14; + return Binary; +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {13} 1 == imm12, 0 == Rm + // {12} isAdd + // {11-0} imm12/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + unsigned Imm = MO1.getImm(); + bool isAdd = ARM_AM::getAM2Op(Imm) == ARM_AM::add; + bool isReg = MO.getReg() != 0; + uint32_t Binary = ARM_AM::getAM2Offset(Imm); + // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm12 + if (isReg) { + ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm); + Binary <<= 7; // Shift amount is bits [11:7] + Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5] + Binary |= getARMRegisterNumbering(MO.getReg()); // Rm is bits [3:0] + } + return Binary | (isAdd << 12) | (isReg << 13); +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {9} 1 == imm8, 0 == Rm + // {8} isAdd + // {7-4} imm7_4/zero + // {3-0} imm3_0/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + unsigned Imm = MO1.getImm(); + bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add; + bool isImm = MO.getReg() == 0; + uint32_t Imm8 = ARM_AM::getAM3Offset(Imm); + // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8 + if (!isImm) + Imm8 = getARMRegisterNumbering(MO.getReg()); + return Imm8 | (isAdd << 8) | (isImm << 9); +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {13} 1 == imm8, 0 == Rm + // {12-9} Rn + // {8} isAdd + // {7-4} imm7_4/zero + // {3-0} imm3_0/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + const MCOperand &MO2 = MI.getOperand(OpIdx+2); + unsigned Rn = getARMRegisterNumbering(MO.getReg()); + unsigned Imm = MO2.getImm(); + bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add; + bool isImm = MO1.getReg() == 0; + uint32_t Imm8 = ARM_AM::getAM3Offset(Imm); + // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8 + if (!isImm) + Imm8 = getARMRegisterNumbering(MO1.getReg()); + return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13); +} + +/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands. +uint32_t ARMMCCodeEmitter:: +getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // [SP, #imm] + // {7-0} = imm8 + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + assert(MI.getOperand(OpIdx).getReg() == ARM::SP && + "Unexpected base register!"); + + // The immediate is already shifted for the implicit zeroes, so no change + // here. + return MO1.getImm() & 0xff; +} + +/// getAddrModeISOpValue - Encode the t_addrmode_is# operands. +uint32_t ARMMCCodeEmitter:: +getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // [Rn, #imm] + // {7-3} = imm5 + // {2-0} = Rn + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + unsigned Rn = getARMRegisterNumbering(MO.getReg()); + unsigned Imm5 = MO1.getImm(); + return ((Imm5 & 0x1f) << 3) | Rn; +} + +/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands. +uint32_t ARMMCCodeEmitter:: +getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups); +} + +/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand. +uint32_t ARMMCCodeEmitter:: +getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {12-9} = reg + // {8} = (U)nsigned (add == '1', sub == '0') + // {7-0} = imm8 + unsigned Reg, Imm8; + bool isAdd; + // If The first operand isn't a register, we have a label reference. + const MCOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) { + Reg = getARMRegisterNumbering(ARM::PC); // Rn is PC. + Imm8 = 0; + isAdd = false; // 'U' bit is handled as part of the fixup. + + assert(MO.isExpr() && "Unexpected machine operand type!"); + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind; + if (Subtarget->isThumb2()) + Kind = MCFixupKind(ARM::fixup_t2_pcrel_10); + else + Kind = MCFixupKind(ARM::fixup_arm_pcrel_10); + Fixups.push_back(MCFixup::Create(0, Expr, Kind)); + + ++MCNumCPRelocations; + } else { + EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups); + isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add; + } + + uint32_t Binary = ARM_AM::getAM5Offset(Imm8); + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 8); + Binary |= (Reg << 9); + return Binary; +} + +unsigned ARMMCCodeEmitter:: +getSORegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be + // shifted. The second is either Rs, the amount to shift by, or reg0 in which + // case the imm contains the amount to shift by. + // + // {3-0} = Rm. + // {4} = 1 if reg shift, 0 if imm shift + // {6-5} = type + // If reg shift: + // {11-8} = Rs + // {7} = 0 + // else (imm shift) + // {11-7} = imm + + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + const MCOperand &MO2 = MI.getOperand(OpIdx + 2); + ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm()); + + // Encode Rm. + unsigned Binary = getARMRegisterNumbering(MO.getReg()); + + // Encode the shift opcode. + unsigned SBits = 0; + unsigned Rs = MO1.getReg(); + if (Rs) { + // Set shift operand (bit[7:4]). + // LSL - 0001 + // LSR - 0011 + // ASR - 0101 + // ROR - 0111 + // RRX - 0110 and bit[11:8] clear. + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x1; break; + case ARM_AM::lsr: SBits = 0x3; break; + case ARM_AM::asr: SBits = 0x5; break; + case ARM_AM::ror: SBits = 0x7; break; + case ARM_AM::rrx: SBits = 0x6; break; + } + } else { + // Set shift operand (bit[6:4]). + // LSL - 000 + // LSR - 010 + // ASR - 100 + // ROR - 110 + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x0; break; + case ARM_AM::lsr: SBits = 0x2; break; + case ARM_AM::asr: SBits = 0x4; break; + case ARM_AM::ror: SBits = 0x6; break; + } + } + + Binary |= SBits << 4; + if (SOpc == ARM_AM::rrx) + return Binary; + + // Encode the shift operation Rs or shift_imm (except rrx). + if (Rs) { + // Encode Rs bit[11:8]. + assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0); + return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift); + } + + // Encode shift_imm bit[11:7]. + return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7; +} + +unsigned ARMMCCodeEmitter:: +getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + const MCOperand &MO2 = MI.getOperand(OpNum+1); + const MCOperand &MO3 = MI.getOperand(OpNum+2); + + // Encoded as [Rn, Rm, imm]. + // FIXME: Needs fixup support. + unsigned Value = getARMRegisterNumbering(MO1.getReg()); + Value <<= 4; + Value |= getARMRegisterNumbering(MO2.getReg()); + Value <<= 2; + Value |= MO3.getImm(); + + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + const MCOperand &MO2 = MI.getOperand(OpNum+1); + + // FIXME: Needs fixup support. + unsigned Value = getARMRegisterNumbering(MO1.getReg()); + + // Even though the immediate is 8 bits long, we need 9 bits in order + // to represent the (inverse of the) sign bit. + Value <<= 9; + int32_t tmp = (int32_t)MO2.getImm(); + if (tmp < 0) + tmp = abs(tmp); + else + Value |= 256; // Set the ADD bit + Value |= tmp & 255; + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + + // FIXME: Needs fixup support. + unsigned Value = 0; + int32_t tmp = (int32_t)MO1.getImm(); + if (tmp < 0) + tmp = abs(tmp); + else + Value |= 256; // Set the ADD bit + Value |= tmp & 255; + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + + // FIXME: Needs fixup support. + unsigned Value = 0; + int32_t tmp = (int32_t)MO1.getImm(); + if (tmp < 0) + tmp = abs(tmp); + else + Value |= 4096; // Set the ADD bit + Value |= tmp & 4095; + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2SORegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // Sub-operands are [reg, imm]. The first register is Rm, the reg to be + // shifted. The second is the amount to shift by. + // + // {3-0} = Rm. + // {4} = 0 + // {6-5} = type + // {11-7} = imm + + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm()); + + // Encode Rm. + unsigned Binary = getARMRegisterNumbering(MO.getReg()); + + // Encode the shift opcode. + unsigned SBits = 0; + // Set shift operand (bit[6:4]). + // LSL - 000 + // LSR - 010 + // ASR - 100 + // ROR - 110 + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x0; break; + case ARM_AM::lsr: SBits = 0x2; break; + case ARM_AM::asr: SBits = 0x4; break; + case ARM_AM::ror: SBits = 0x6; break; + } + + Binary |= SBits << 4; + if (SOpc == ARM_AM::rrx) + return Binary; + + // Encode shift_imm bit[11:7]. + return Binary | ARM_AM::getSORegOffset(MO1.getImm()) << 7; +} + +unsigned ARMMCCodeEmitter:: +getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the + // msb of the mask. + const MCOperand &MO = MI.getOperand(Op); + uint32_t v = ~MO.getImm(); + uint32_t lsb = CountTrailingZeros_32(v); + uint32_t msb = (32 - CountLeadingZeros_32 (v)) - 1; + assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!"); + return lsb | (msb << 5); +} + +unsigned ARMMCCodeEmitter:: +getMsbOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + // MSB - 5 bits. + uint32_t lsb = MI.getOperand(Op-1).getImm(); + uint32_t width = MI.getOperand(Op).getImm(); + uint32_t msb = lsb+width-1; + assert (width != 0 && msb < 32 && "Illegal bit width!"); + return msb; +} + +unsigned ARMMCCodeEmitter:: +getRegisterListOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + // VLDM/VSTM: + // {12-8} = Vd + // {7-0} = Number of registers + // + // LDM/STM: + // {15-0} = Bitfield of GPRs. + unsigned Reg = MI.getOperand(Op).getReg(); + bool SPRRegs = ARM::SPRRegClass.contains(Reg); + bool DPRRegs = ARM::DPRRegClass.contains(Reg); + + unsigned Binary = 0; + + if (SPRRegs || DPRRegs) { + // VLDM/VSTM + unsigned RegNo = getARMRegisterNumbering(Reg); + unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff; + Binary |= (RegNo & 0x1f) << 8; + if (SPRRegs) + Binary |= NumRegs; + else + Binary |= NumRegs * 2; + } else { + for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) { + unsigned RegNo = getARMRegisterNumbering(MI.getOperand(I).getReg()); + Binary |= 1 << RegNo; + } + } + + return Binary; +} + +/// getAddrMode6AddressOpValue - Encode an addrmode6 register number along +/// with the alignment operand. +unsigned ARMMCCodeEmitter:: +getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &Reg = MI.getOperand(Op); + const MCOperand &Imm = MI.getOperand(Op + 1); + + unsigned RegNo = getARMRegisterNumbering(Reg.getReg()); + unsigned Align = 0; + + switch (Imm.getImm()) { + default: break; + case 2: + case 4: + case 8: Align = 0x01; break; + case 16: Align = 0x02; break; + case 32: Align = 0x03; break; + } + + return RegNo | (Align << 4); +} + +/// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and +/// alignment operand for use in VLD-dup instructions. This is the same as +/// getAddrMode6AddressOpValue except for the alignment encoding, which is +/// different for VLD4-dup. +unsigned ARMMCCodeEmitter:: +getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &Reg = MI.getOperand(Op); + const MCOperand &Imm = MI.getOperand(Op + 1); + + unsigned RegNo = getARMRegisterNumbering(Reg.getReg()); + unsigned Align = 0; + + switch (Imm.getImm()) { + default: break; + case 2: + case 4: + case 8: Align = 0x01; break; + case 16: Align = 0x03; break; + } + + return RegNo | (Align << 4); +} + +unsigned ARMMCCodeEmitter:: +getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO = MI.getOperand(Op); + if (MO.getReg() == 0) return 0x0D; + return MO.getReg(); +} + +void ARMMCCodeEmitter:: +EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const { + // Pseudo instructions don't get encoded. + const TargetInstrDesc &Desc = TII.get(MI.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + if ((TSFlags & ARMII::FormMask) == ARMII::Pseudo) + return; + int Size; + // Basic size info comes from the TSFlags field. + switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) { + default: llvm_unreachable("Unexpected instruction size!"); + case ARMII::Size2Bytes: Size = 2; break; + case ARMII::Size4Bytes: Size = 4; break; + } + uint32_t Binary = getBinaryCodeForInstr(MI, Fixups); + // Thumb 32-bit wide instructions need to emit the high order halfword + // first. + if (Subtarget->isThumb() && Size == 4) { + EmitConstant(Binary >> 16, 2, OS); + EmitConstant(Binary & 0xffff, 2, OS); + } else + EmitConstant(Binary, Size, OS); + ++MCNumEmitted; // Keep track of the # of mi's emitted. +} + +#include "ARMGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/ARM/ARMMCExpr.cpp b/contrib/llvm/lib/Target/ARM/ARMMCExpr.cpp new file mode 100644 index 0000000..2727ba8 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMMCExpr.cpp @@ -0,0 +1,73 @@ +//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "armmcexpr" +#include "ARMMCExpr.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCAssembler.h" +using namespace llvm; + +const ARMMCExpr* +ARMMCExpr::Create(VariantKind Kind, const MCExpr *Expr, + MCContext &Ctx) { + return new (Ctx) ARMMCExpr(Kind, Expr); +} + +void ARMMCExpr::PrintImpl(raw_ostream &OS) const { + switch (Kind) { + default: assert(0 && "Invalid kind!"); + case VK_ARM_HI16: OS << ":upper16:"; break; + case VK_ARM_LO16: OS << ":lower16:"; break; + } + + const MCExpr *Expr = getSubExpr(); + if (Expr->getKind() != MCExpr::SymbolRef) + OS << '('; + Expr->print(OS); + if (Expr->getKind() != MCExpr::SymbolRef) + OS << ')'; +} + +bool +ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const { + return false; +} + +// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps +// that method should be made public? +static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) { + switch (Value->getKind()) { + case MCExpr::Target: + assert(0 && "Can't handle nested target expr!"); + break; + + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); + AddValueSymbols_(BE->getLHS(), Asm); + AddValueSymbols_(BE->getRHS(), Asm); + break; + } + + case MCExpr::SymbolRef: + Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol()); + break; + + case MCExpr::Unary: + AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm); + break; + } +} + +void ARMMCExpr::AddValueSymbols(MCAssembler *Asm) const { + AddValueSymbols_(getSubExpr(), Asm); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/ARMMCExpr.h new file mode 100644 index 0000000..d42f766 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMMCExpr.h @@ -0,0 +1,73 @@ +//===-- ARMMCExpr.h - ARM specific MC expression classes ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMMCEXPR_H +#define ARMMCEXPR_H + +#include "llvm/MC/MCExpr.h" + +namespace llvm { + +class ARMMCExpr : public MCTargetExpr { +public: + enum VariantKind { + VK_ARM_None, + VK_ARM_HI16, // The R_ARM_MOVT_ABS relocation (:upper16: in the .s file) + VK_ARM_LO16 // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file) + }; + +private: + const VariantKind Kind; + const MCExpr *Expr; + + explicit ARMMCExpr(VariantKind _Kind, const MCExpr *_Expr) + : Kind(_Kind), Expr(_Expr) {} + +public: + /// @name Construction + /// @{ + + static const ARMMCExpr *Create(VariantKind Kind, const MCExpr *Expr, + MCContext &Ctx); + + static const ARMMCExpr *CreateUpper16(const MCExpr *Expr, MCContext &Ctx) { + return Create(VK_ARM_HI16, Expr, Ctx); + } + + static const ARMMCExpr *CreateLower16(const MCExpr *Expr, MCContext &Ctx) { + return Create(VK_ARM_LO16, Expr, Ctx); + } + + /// @} + /// @name Accessors + /// @{ + + /// getOpcode - Get the kind of this expression. + VariantKind getKind() const { return Kind; } + + /// getSubExpr - Get the child of this expression. + const MCExpr *getSubExpr() const { return Expr; } + + /// @} + + void PrintImpl(raw_ostream &OS) const; + bool EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const; + void AddValueSymbols(MCAssembler *) const; + + static bool classof(const MCExpr *E) { + return E->getKind() == MCExpr::Target; + } + + static bool classof(const ARMMCExpr *) { return true; } + +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp index ab2b06b..59d6050 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp @@ -12,122 +12,69 @@ // //===----------------------------------------------------------------------===// -#include "ARMMCInstLower.h" -//#include "llvm/CodeGen/MachineModuleInfoImpls.h" -#include "llvm/CodeGen/AsmPrinter.h" +#include "ARM.h" +#include "ARMAsmPrinter.h" +#include "ARMMCExpr.h" +#include "llvm/Constants.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" -//#include "llvm/MC/MCStreamer.h" #include "llvm/Target/Mangler.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallString.h" using namespace llvm; -#if 0 -const ARMSubtarget &ARMMCInstLower::getSubtarget() const { - return AsmPrinter.getSubtarget(); -} - -MachineModuleInfoMachO &ARMMCInstLower::getMachOMMI() const { - assert(getSubtarget().isTargetDarwin() &&"Can only get MachO info on darwin"); - return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>(); -} -#endif - -MCSymbol *ARMMCInstLower:: -GetGlobalAddressSymbol(const MachineOperand &MO) const { - // FIXME: HANDLE PLT references how?? - switch (MO.getTargetFlags()) { - default: assert(0 && "Unknown target flag on GV operand"); - case 0: break; - } - - return Printer.Mang->getSymbol(MO.getGlobal()); -} - -MCSymbol *ARMMCInstLower:: -GetExternalSymbolSymbol(const MachineOperand &MO) const { - // FIXME: HANDLE PLT references how?? +static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, + ARMAsmPrinter &Printer) { + MCContext &Ctx = Printer.OutContext; + const MCExpr *Expr; switch (MO.getTargetFlags()) { - default: assert(0 && "Unknown target flag on GV operand"); - case 0: break; + default: { + Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, Ctx); + switch (MO.getTargetFlags()) { + default: + assert(0 && "Unknown target flag on symbol operand"); + case 0: + break; + case ARMII::MO_LO16: + Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, Ctx); + Expr = ARMMCExpr::CreateLower16(Expr, Ctx); + break; + case ARMII::MO_HI16: + Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, Ctx); + Expr = ARMMCExpr::CreateUpper16(Expr, Ctx); + break; + } + break; } - - return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); -} - - -MCSymbol *ARMMCInstLower:: -GetJumpTableSymbol(const MachineOperand &MO) const { - SmallString<256> Name; - raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI" - << Printer.getFunctionNumber() << '_' << MO.getIndex(); - -#if 0 - switch (MO.getTargetFlags()) { - default: llvm_unreachable("Unknown target flag on GV operand"); + case ARMII::MO_PLT: + Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_ARM_PLT, Ctx); + break; } -#endif - - // Create a symbol for the name. - return Ctx.GetOrCreateSymbol(Name.str()); -} -MCSymbol *ARMMCInstLower:: -GetConstantPoolIndexSymbol(const MachineOperand &MO) const { - SmallString<256> Name; - raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI" - << Printer.getFunctionNumber() << '_' << MO.getIndex(); - -#if 0 - switch (MO.getTargetFlags()) { - default: llvm_unreachable("Unknown target flag on GV operand"); - } -#endif - - // Create a symbol for the name. - return Ctx.GetOrCreateSymbol(Name.str()); -} - -MCOperand ARMMCInstLower:: -LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { - // FIXME: We would like an efficient form for this, so we don't have to do a - // lot of extra uniquing. - const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx); - -#if 0 - switch (MO.getTargetFlags()) { - default: llvm_unreachable("Unknown target flag on GV operand"); - } -#endif - if (!MO.isJTI() && MO.getOffset()) Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); return MCOperand::CreateExpr(Expr); -} +} -void ARMMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { +void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + ARMAsmPrinter &AP) { OutMI.setOpcode(MI->getOpcode()); - + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); - + MCOperand MCOp; switch (MO.getType()) { default: MI->dump(); assert(0 && "unknown operand type"); case MachineOperand::MO_Register: - // Ignore all implicit register operands. - if (MO.isImplicit()) continue; + // Ignore all non-CPSR implicit register operands. + if (MO.isImplicit() && MO.getReg() != ARM::CPSR) continue; assert(!MO.getSubReg() && "Subregs should be eliminated!"); MCOp = MCOperand::CreateReg(MO.getReg()); break; @@ -136,27 +83,33 @@ void ARMMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( - MO.getMBB()->getSymbol(), Ctx)); + MO.getMBB()->getSymbol(), AP.OutContext)); break; case MachineOperand::MO_GlobalAddress: - MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); + MCOp = GetSymbolRef(MO, AP.Mang->getSymbol(MO.getGlobal()), AP); break; case MachineOperand::MO_ExternalSymbol: - MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); + MCOp = GetSymbolRef(MO, + AP.GetExternalSymbolSymbol(MO.getSymbolName()), AP); break; case MachineOperand::MO_JumpTableIndex: - MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO)); + MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP); break; case MachineOperand::MO_ConstantPoolIndex: - MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO)); + MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP); break; case MachineOperand::MO_BlockAddress: - MCOp = LowerSymbolOperand(MO, Printer.GetBlockAddressSymbol( - MO.getBlockAddress())); + MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP); break; + case MachineOperand::MO_FPImmediate: { + APFloat Val = MO.getFPImm()->getValueAPF(); + bool ignored; + Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); + MCOp = MCOperand::CreateFPImm(Val.convertToDouble()); + break; + } } - + OutMI.addOperand(MCOp); } - } diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.h b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.h deleted file mode 100644 index b81a306..0000000 --- a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.h +++ /dev/null @@ -1,56 +0,0 @@ -//===-- ARMMCInstLower.h - Lower MachineInstr to MCInst -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM_MCINSTLOWER_H -#define ARM_MCINSTLOWER_H - -#include "llvm/Support/Compiler.h" - -namespace llvm { - class AsmPrinter; - class MCAsmInfo; - class MCContext; - class MCInst; - class MCOperand; - class MCSymbol; - class MachineInstr; - class MachineModuleInfoMachO; - class MachineOperand; - class Mangler; - //class ARMSubtarget; - -/// ARMMCInstLower - This class is used to lower an MachineInstr into an MCInst. -class LLVM_LIBRARY_VISIBILITY ARMMCInstLower { - MCContext &Ctx; - Mangler &Mang; - AsmPrinter &Printer; - - //const ARMSubtarget &getSubtarget() const; -public: - ARMMCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer) - : Ctx(ctx), Mang(mang), Printer(printer) {} - - void Lower(const MachineInstr *MI, MCInst &OutMI) const; - - //MCSymbol *GetPICBaseSymbol() const; - MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; - MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; - MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const; - MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const; - MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; - -/* -private: - MachineModuleInfoMachO &getMachOMMI() const; - */ -}; - -} - -#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 514c26b..138f0c2 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -22,8 +22,8 @@ namespace llvm { -/// ARMFunctionInfo - This class is derived from MachineFunction private -/// ARM target-specific information for each MachineFunction. +/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and +/// contains private ARM-specific information for each MachineFunction. class ARMFunctionInfo : public MachineFunctionInfo { /// isThumb - True if this function is compiled under Thumb mode. @@ -79,15 +79,11 @@ class ARMFunctionInfo : public MachineFunctionInfo { BitVector GPRCS2Frames; BitVector DPRCSFrames; - /// SpilledCSRegs - A BitVector mask of all spilled callee-saved registers. - /// - BitVector SpilledCSRegs; - /// JumpTableUId - Unique id for jumptables. /// unsigned JumpTableUId; - unsigned ConstPoolEntryUId; + unsigned PICLabelUId; /// VarArgsFrameIndex - FrameIndex for start of varargs area. int VarArgsFrameIndex; @@ -95,6 +91,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// HasITBlocks - True if IT blocks have been inserted. bool HasITBlocks; + /// CPEClones - Track constant pool entries clones created by Constant Island + /// pass. + DenseMap<unsigned, unsigned> CPEClones; + public: ARMFunctionInfo() : isThumb(false), @@ -104,8 +104,8 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0), - JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0), - HasITBlocks(false) {} + JumpTableUId(0), PICLabelUId(0), + VarArgsFrameIndex(0), HasITBlocks(false) {} explicit ARMFunctionInfo(MachineFunction &MF) : isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()), @@ -115,9 +115,8 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32), - SpilledCSRegs(MF.getTarget().getRegisterInfo()->getNumRegs()), - JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0), - HasITBlocks(false) {} + JumpTableUId(0), PICLabelUId(0), + VarArgsFrameIndex(0), HasITBlocks(false) {} bool isThumbFunction() const { return isThumb; } bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } @@ -207,18 +206,6 @@ public: } } - void setCSRegisterIsSpilled(unsigned Reg) { - SpilledCSRegs.set(Reg); - } - - bool isCSRegisterSpilled(unsigned Reg) const { - return SpilledCSRegs[Reg]; - } - - const BitVector &getSpilledCSRegisters() const { - return SpilledCSRegs; - } - unsigned createJumpTableUId() { return JumpTableUId++; } @@ -227,16 +214,16 @@ public: return JumpTableUId; } - void initConstPoolEntryUId(unsigned UId) { - ConstPoolEntryUId = UId; + void initPICLabelUId(unsigned UId) { + PICLabelUId = UId; } - unsigned getNumConstPoolEntries() const { - return ConstPoolEntryUId; + unsigned getNumPICLabels() const { + return PICLabelUId; } - unsigned createConstPoolEntryUId() { - return ConstPoolEntryUId++; + unsigned createPICLabelUId() { + return PICLabelUId++; } int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } @@ -244,6 +231,19 @@ public: bool hasITBlocks() const { return HasITBlocks; } void setHasITBlocks(bool h) { HasITBlocks = h; } + + void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) { + if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second) + assert(0 && "Duplicate entries!"); + } + + unsigned getOriginalCPIdx(unsigned CloneIdx) const { + DenseMap<unsigned, unsigned>::const_iterator I = CPEClones.find(CloneIdx); + if (I != CPEClones.end()) + return I->second; + else + return -1U; + } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h index 5ff7c38..edecc4b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h +++ b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h @@ -21,6566 +21,6566 @@ // This table is 6561*4 = 26244 bytes in size. static const unsigned PerfectShuffleTable[6561+1] = { - 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS - 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS - 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> - 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS - 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> - 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> - 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS - 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> - 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS - 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> - 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> - 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> - 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> - 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> - 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS - 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> - 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> - 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS - 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> - 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> - 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> - 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> - 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS - 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> - 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> - 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> - 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> - 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> - 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> - 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> - 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> - 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> - 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> - 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> - 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS - 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> - 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> - 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> - 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> - 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> - 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> - 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> - 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> - 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> - 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> - 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS - 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> - 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> - 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS - 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> - 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> - 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> - 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> - 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> - 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> - 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> - 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> - 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> - 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> - 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> - 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS - 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS - 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS - 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> - 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS - 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> - 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS - 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> - 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> - 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> - 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> - 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> - 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> - 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> - 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS - 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> - 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> - 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> - 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> - 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS - 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> - 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> - 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> - 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS - 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS - 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> - 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> - 835584U, // <0,1,2,3>: Cost 0 copy LHS - 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS - 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> - 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> - 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> - 835584U, // <0,1,2,u>: Cost 0 copy LHS - 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> - 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> - 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> - 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> - 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS - 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> - 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> - 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> - 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> - 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS - 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> - 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> - 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> - 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS - 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> - 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS - 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> - 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> - 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> - 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> - 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> - 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> - 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> - 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> - 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> - 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS - 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> - 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> - 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> - 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS - 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> - 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> - 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> - 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> - 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> - 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> - 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> - 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> - 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> - 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> - 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> - 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> - 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> - 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS - 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS - 835584U, // <0,1,u,3>: Cost 0 copy LHS - 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS - 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> - 835584U, // <0,1,u,u>: Cost 0 copy LHS - 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> - 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS - 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> - 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> - 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> - 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> - 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> - 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS - 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> - 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> - 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> - 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS - 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> - 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> - 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> - 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> - 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> - 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> - 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> - 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS - 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> - 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> - 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> - 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS - 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> - 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> - 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> - 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> - 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> - 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS - 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> - 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> - 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> - 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS - 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS - 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS - 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> - 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> - 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> - 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> - 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> - 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> - 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> - 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS - 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> - 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> - 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> - 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> - 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> - 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> - 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> - 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> - 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> - 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> - 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> - 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> - 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> - 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> - 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> - 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> - 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> - 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS - 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS - 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS - 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS - 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> - 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> - 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> - 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> - 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS - 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> - 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> - 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> - 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS - 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> - 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> - 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> - 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> - 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> - 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> - 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> - 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> - 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> - 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS - 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> - 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> - 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS - 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3> - 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2> - 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS - 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> - 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> - 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> - 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> - 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> - 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> - 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> - 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> - 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> - 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> - 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> - 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> - 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> - 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> - 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> - 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS - 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> - 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> - 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS - 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> - 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> - 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> - 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> - 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> - 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> - 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> - 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> - 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> - 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> - 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> - 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> - 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> - 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> - 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> - 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> - 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> - 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> - 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> - 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> - 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> - 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> - 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> - 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> - 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> - 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> - 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS - 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> - 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> - 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS - 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3> - 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> - 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS - 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> - 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> - 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> - 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> - 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> - 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> - 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS - 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS - 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> - 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> - 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> - 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS - 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS - 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS - 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS - 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS - 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> - 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> - 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> - 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS - 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS - 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS - 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS - 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> - 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> - 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> - 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> - 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> - 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> - 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS - 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> - 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> - 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> - 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> - 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> - 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> - 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> - 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS - 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS - 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> - 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS - 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS - 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> - 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> - 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> - 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS - 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> - 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> - 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> - 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> - 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> - 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> - 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> - 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> - 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> - 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> - 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> - 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS - 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> - 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> - 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> - 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS - 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> - 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> - 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS - 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS - 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> - 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS - 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS - 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS - 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> - 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS - 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> - 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS - 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> - 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> - 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS - 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> - 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> - 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS - 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS - 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS - 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> - 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> - 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> - 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> - 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> - 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> - 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> - 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS - 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS - 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> - 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> - 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> - 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS - 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> - 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> - 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS - 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS - 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> - 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> - 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> - 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> - 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> - 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> - 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> - 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> - 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> - 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> - 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> - 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> - 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> - 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> - 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS - 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> - 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> - 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS - 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> - 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> - 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> - 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> - 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> - 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> - 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> - 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> - 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> - 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS - 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> - 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> - 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> - 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS - 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> - 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> - 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> - 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> - 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS - 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> - 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> - 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> - 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS - 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> - 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> - 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> - 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS - 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS - 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS - 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> - 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> - 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> - 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS - 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> - 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> - 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> - 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS - 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> - 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> - 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> - 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> - 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> - 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS - 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS - 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS - 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> - 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> - 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> - 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS - 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> - 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> - 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS - 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS - 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> - 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> - 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> - 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> - 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> - 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> - 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS - 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS - 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> - 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> - 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> - 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> - 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> - 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> - 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> - 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> - 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> - 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS - 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> - 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> - 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> - 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS - 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> - 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS - 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS - 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> - 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> - 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> - 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> - 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> - 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> - 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS - 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> - 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> - 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> - 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> - 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> - 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> - 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> - 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> - 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> - 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> - 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> - 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> - 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> - 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> - 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> - 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> - 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> - 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> - 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS - 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> - 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> - 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> - 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> - 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS - 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS - 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> - 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS - 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> - 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> - 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> - 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> - 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> - 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> - 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS - 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> - 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> - 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> - 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> - 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> - 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> - 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> - 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> - 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> - 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS - 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> - 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> - 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> - 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS - 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> - 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> - 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> - 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> - 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> - 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> - 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> - 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> - 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> - 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> - 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> - 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> - 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS - 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> - 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> - 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> - 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> - 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS - 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> - 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> - 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS - 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> - 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> - 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> - 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> - 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> - 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> - 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> - 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> - 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> - 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> - 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> - 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> - 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> - 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS - 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> - 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> - 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> - 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> - 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> - 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> - 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> - 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> - 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS - 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> - 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> - 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> - 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> - 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> - 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> - 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> - 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> - 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> - 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS - 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7> - 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> - 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> - 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS - 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS - 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2> - 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS - 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6> - 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> - 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS - 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> - 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS - 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS - 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS - 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7> - 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS - 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS - 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> - 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS - 835584U, // <0,u,2,3>: Cost 0 copy LHS - 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS - 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6> - 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS - 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> - 835584U, // <0,u,2,u>: Cost 0 copy LHS - 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> - 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> - 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> - 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> - 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS - 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> - 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS - 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS - 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6> - 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS - 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS - 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> - 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7> - 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7> - 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS - 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> - 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS - 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS - 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6> - 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> - 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7> - 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS - 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> - 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> - 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> - 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> - 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS - 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> - 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> - 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> - 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS - 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6> - 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> - 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> - 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS - 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS - 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS - 835584U, // <0,u,u,3>: Cost 0 copy LHS - 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS - 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS - 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> - 835584U, // <0,u,u,u>: Cost 0 copy LHS - 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> - 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> - 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> - 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> - 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> - 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> - 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> - 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> - 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> - 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS - 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> - 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> - 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS - 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> - 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> - 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> - 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS - 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> - 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> - 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> - 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> - 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> - 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> - 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> - 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> - 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> - 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> - 67944550U, // <1,0,3,2>: Cost 1 vrev LHS - 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> - 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS - 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> - 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> - 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> - 68386972U, // <1,0,3,u>: Cost 1 vrev LHS - 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> - 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> - 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> - 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> - 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> - 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS - 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> - 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> - 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS - 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> - 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS - 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> - 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> - 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> - 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> - 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS - 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS - 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> - 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> - 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> - 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> - 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> - 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> - 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> - 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> - 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> - 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> - 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> - 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> - 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> - 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> - 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> - 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> - 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> - 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> - 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> - 67985515U, // <1,0,u,2>: Cost 1 vrev LHS - 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> - 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> - 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS - 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0> - 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> - 68427937U, // <1,0,u,u>: Cost 1 vrev LHS - 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> - 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS - 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> - 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> - 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> - 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> - 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> - 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> - 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> - 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS - 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> - 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> - 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> - 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> - 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> - 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS - 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> - 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> - 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> - 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> - 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS - 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> - 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> - 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> - 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> - 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> - 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> - 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> - 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS - 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> - 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> - 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> - 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> - 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS - 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS - 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> - 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> - 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> - 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS - 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS - 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> - 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS - 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> - 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> - 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> - 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> - 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> - 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> - 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> - 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> - 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> - 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> - 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> - 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> - 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> - 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> - 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> - 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> - 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> - 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> - 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> - 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS - 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> - 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> - 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> - 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> - 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS - 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3> - 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS - 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS - 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7> - 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS - 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> - 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS - 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> - 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> - 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> - 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> - 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> - 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> - 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS - 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> - 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> - 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> - 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS - 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS - 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> - 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> - 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> - 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS - 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> - 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> - 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> - 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> - 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> - 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> - 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> - 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> - 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> - 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS - 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> - 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS - 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> - 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS - 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> - 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> - 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> - 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2> - 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> - 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS - 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> - 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> - 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS - 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS - 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> - 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> - 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS - 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> - 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> - 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> - 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS - 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> - 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> - 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> - 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> - 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> - 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> - 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> - 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> - 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> - 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> - 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> - 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> - 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> - 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> - 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> - 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> - 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> - 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> - 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS - 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> - 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS - 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS - 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS - 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> - 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS - 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> - 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> - 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> - 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> - 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> - 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> - 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS - 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> - 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> - 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> - 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS - 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS - 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> - 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> - 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> - 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS - 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> - 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> - 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> - 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> - 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> - 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> - 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> - 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> - 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> - 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS - 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> - 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> - 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> - 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS - 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> - 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3> - 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7> - 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS - 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS - 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> - 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> - 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> - 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS - 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS - 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> - 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> - 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS - 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS - 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> - 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> - 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> - 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS - 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> - 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> - 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS - 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS - 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> - 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> - 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> - 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> - 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> - 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5> - 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> - 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> - 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> - 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> - 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> - 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> - 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> - 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> - 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> - 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> - 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> - 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> - 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS - 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> - 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> - 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS - 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS - 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS - 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> - 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS - 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS - 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> - 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS - 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> - 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> - 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> - 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> - 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> - 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> - 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> - 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> - 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> - 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> - 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> - 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> - 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS - 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS - 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> - 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS - 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> - 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> - 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> - 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> - 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> - 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS - 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> - 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> - 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS - 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS - 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> - 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> - 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> - 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> - 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> - 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> - 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> - 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> - 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> - 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> - 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> - 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> - 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> - 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS - 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> - 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> - 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS - 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS - 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> - 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1> - 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> - 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS - 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS - 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS - 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS - 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS - 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS - 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> - 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> - 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> - 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS - 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> - 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> - 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> - 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS - 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> - 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> - 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> - 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> - 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS - 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> - 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> - 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> - 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4> - 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS - 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS - 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> - 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> - 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6> - 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> - 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS - 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> - 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS - 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> - 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS - 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> - 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> - 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> - 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> - 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> - 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> - 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS - 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> - 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> - 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> - 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> - 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> - 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> - 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> - 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS - 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> - 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> - 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> - 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> - 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> - 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> - 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> - 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> - 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS - 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> - 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> - 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> - 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> - 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> - 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> - 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> - 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> - 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS - 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS - 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> - 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> - 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> - 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> - 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> - 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS - 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS - 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> - 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS - 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS - 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> - 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> - 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3> - 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS - 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> - 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0> - 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> - 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS - 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> - 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> - 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3> - 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> - 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> - 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> - 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6> - 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> - 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> - 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS - 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> - 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> - 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> - 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS - 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> - 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> - 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> - 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS - 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2> - 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS - 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3> - 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1> - 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5> - 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS - 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7> - 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS - 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS - 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> - 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS - 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> - 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> - 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> - 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> - 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> - 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS - 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS - 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2> - 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> - 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> - 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> - 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> - 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> - 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> - 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS - 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS - 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> - 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> - 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> - 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> - 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS - 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> - 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> - 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS - 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS - 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS - 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> - 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> - 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> - 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS - 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5> - 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> - 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> - 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> - 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> - 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> - 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> - 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> - 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> - 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS - 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS - 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS - 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS - 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> - 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> - 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> - 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> - 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> - 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> - 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> - 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS - 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS - 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> - 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> - 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> - 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> - 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS - 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> - 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> - 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> - 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> - 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> - 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> - 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> - 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> - 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> - 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> - 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> - 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS - 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> - 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> - 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS - 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> - 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1> - 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> - 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS - 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7> - 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> - 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> - 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> - 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS - 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> - 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> - 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> - 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> - 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> - 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> - 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS - 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS - 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> - 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> - 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> - 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS - 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> - 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> - 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> - 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> - 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS - 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> - 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> - 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> - 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS - 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> - 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> - 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> - 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS - 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS - 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> - 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> - 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> - 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS - 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> - 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> - 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> - 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS - 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> - 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> - 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> - 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> - 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> - 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS - 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> - 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> - 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS - 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS - 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> - 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> - 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> - 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS - 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> - 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> - 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS - 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS - 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> - 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> - 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> - 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> - 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS - 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> - 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> - 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0> - 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> - 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> - 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> - 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> - 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> - 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS - 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> - 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> - 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> - 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> - 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS - 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS - 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> - 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> - 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS - 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> - 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> - 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> - 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS - 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> - 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS - 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> - 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> - 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> - 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1> - 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2> - 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> - 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS - 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS - 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS - 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> - 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS - 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS - 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS - 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> - 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS - 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> - 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> - 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> - 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS - 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> - 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS - 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> - 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS - 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 115726126U, // <1,u,3,2>: Cost 1 vrev LHS - 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS - 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS - 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> - 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS - 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS - 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> - 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> - 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4> - 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> - 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS - 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS - 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> - 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6> - 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS - 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS - 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> - 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS - 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS - 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS - 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS - 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS - 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> - 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0> - 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> - 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7> - 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> - 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7> - 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> - 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> - 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> - 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> - 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> - 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> - 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> - 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS - 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> - 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7> - 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> - 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> - 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS - 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS - 115767091U, // <1,u,u,2>: Cost 1 vrev LHS - 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS - 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS - 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS - 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS - 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS - 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS - 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> - 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> - 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> - 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> - 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS - 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> - 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0> - 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> - 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> - 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> - 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> - 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS - 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS - 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS - 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> - 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> - 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> - 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS - 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS - 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> - 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> - 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> - 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS - 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> - 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> - 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> - 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS - 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> - 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> - 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> - 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> - 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS - 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> - 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> - 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> - 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> - 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS - 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> - 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> - 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> - 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS - 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS - 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS - 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> - 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> - 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS - 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> - 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> - 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> - 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> - 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> - 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> - 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS - 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> - 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> - 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS - 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> - 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> - 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> - 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> - 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> - 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> - 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> - 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> - 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> - 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> - 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> - 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> - 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> - 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> - 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> - 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> - 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS - 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> - 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS - 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> - 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS - 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS - 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS - 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS - 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS - 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS - 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS - 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> - 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> - 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS - 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> - 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> - 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> - 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> - 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS - 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> - 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> - 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> - 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS - 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> - 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> - 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> - 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> - 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS - 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> - 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> - 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> - 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS - 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> - 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> - 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> - 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> - 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS - 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> - 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> - 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> - 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS - 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> - 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> - 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> - 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> - 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> - 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> - 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> - 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> - 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS - 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS - 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS - 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> - 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4> - 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS - 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> - 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> - 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> - 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS - 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5> - 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> - 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS - 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> - 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS - 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> - 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> - 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS - 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS - 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> - 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> - 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> - 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS - 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> - 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> - 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> - 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> - 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6> - 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> - 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> - 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> - 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2> - 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS - 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> - 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> - 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> - 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS - 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> - 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> - 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1> - 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> - 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> - 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS - 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> - 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> - 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS - 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> - 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> - 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> - 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> - 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> - 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> - 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0> - 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS - 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS - 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> - 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> - 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> - 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS - 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS - 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> - 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS - 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> - 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS - 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7> - 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> - 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> - 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS - 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> - 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> - 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> - 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS - 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> - 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> - 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> - 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> - 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS - 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS - 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2> - 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> - 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> - 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS - 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS - 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS - 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> - 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS - 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> - 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> - 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> - 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS - 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> - 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> - 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> - 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS - 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS - 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> - 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> - 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> - 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> - 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS - 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> - 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> - 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> - 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> - 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> - 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> - 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> - 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS - 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> - 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> - 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> - 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> - 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> - 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS - 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS - 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS - 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS - 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS - 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS - 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6> - 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS - 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS - 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> - 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS - 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> - 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> - 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> - 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> - 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS - 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> - 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> - 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS - 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> - 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0> - 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> - 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> - 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> - 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> - 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> - 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> - 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> - 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> - 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> - 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> - 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> - 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> - 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> - 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> - 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> - 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS - 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> - 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> - 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> - 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS - 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS - 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> - 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS - 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS - 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> - 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> - 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> - 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> - 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> - 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> - 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> - 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> - 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> - 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> - 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> - 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> - 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> - 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> - 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3> - 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> - 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> - 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> - 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> - 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> - 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> - 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> - 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS - 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> - 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1> - 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> - 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS - 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> - 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1> - 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS - 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> - 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS - 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4> - 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> - 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> - 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> - 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> - 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> - 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> - 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> - 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> - 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> - 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS - 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> - 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> - 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> - 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> - 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS - 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> - 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> - 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> - 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> - 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> - 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS - 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS - 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> - 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS - 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> - 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> - 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> - 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> - 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> - 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> - 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> - 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> - 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> - 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> - 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> - 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> - 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> - 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> - 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS - 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS - 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> - 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS - 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> - 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> - 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> - 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> - 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS - 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> - 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS - 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS - 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS - 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS - 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> - 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> - 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> - 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS - 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS - 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> - 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> - 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS - 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> - 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> - 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> - 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> - 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> - 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> - 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> - 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> - 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> - 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS - 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS - 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> - 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> - 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS - 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> - 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS - 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS - 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS - 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> - 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS - 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> - 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> - 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> - 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> - 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> - 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS - 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS - 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> - 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> - 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> - 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> - 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> - 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> - 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> - 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> - 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> - 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> - 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> - 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> - 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> - 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> - 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> - 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> - 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS - 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS - 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS - 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> - 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> - 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> - 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> - 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> - 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> - 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> - 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> - 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS - 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4> - 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> - 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> - 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS - 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS - 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> - 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS - 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS - 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS - 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> - 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> - 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> - 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS - 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> - 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> - 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> - 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> - 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS - 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> - 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> - 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> - 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> - 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> - 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> - 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS - 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS - 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS - 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> - 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> - 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> - 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS - 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> - 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> - 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS - 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS - 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS - 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS - 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> - 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> - 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6> - 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS - 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> - 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> - 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS - 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> - 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS - 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> - 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> - 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> - 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> - 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> - 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> - 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS - 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> - 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> - 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> - 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS - 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> - 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> - 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7> - 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> - 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS - 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> - 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> - 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> - 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> - 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6> - 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> - 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> - 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS - 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> - 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> - 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> - 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> - 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> - 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> - 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> - 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> - 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS - 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS - 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> - 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> - 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> - 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> - 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> - 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS - 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6> - 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS - 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS - 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> - 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3> - 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> - 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> - 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> - 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5> - 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0> - 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS - 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS - 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS - 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1> - 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> - 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3> - 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS - 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> - 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> - 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS - 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS - 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> - 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> - 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> - 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> - 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> - 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> - 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> - 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0> - 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1> - 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2> - 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS - 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3> - 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS - 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6> - 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS - 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7> - 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS - 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS - 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2> - 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> - 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> - 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> - 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2> - 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7> - 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> - 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2> - 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> - 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> - 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1> - 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0> - 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7> - 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5> - 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7> - 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7> - 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0> - 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> - 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2> - 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7> - 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2> - 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1> - 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6> - 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> - 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> - 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1> - 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7> - 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS - 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> - 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> - 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3> - 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS - 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> - 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> - 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2> - 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS - 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6> - 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4> - 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7> - 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> - 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4> - 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS - 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u> - 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0> - 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS - 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> - 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7> - 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7> - 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5> - 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS - 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5> - 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7> - 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6> - 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS - 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS - 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7> - 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3> - 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6> - 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS - 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> - 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> - 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0> - 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS - 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> - 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2> - 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7> - 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7> - 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS - 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5> - 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> - 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7> - 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1> - 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS - 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> - 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2> - 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u> - 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS - 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS - 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> - 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2> - 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS - 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> - 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS - 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2> - 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7> - 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2> - 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS - 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS - 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> - 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> - 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> - 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> - 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3> - 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> - 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS - 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> - 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS - 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS - 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS - 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS - 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> - 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> - 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS - 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> - 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> - 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS - 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS - 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS - 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> - 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> - 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4> - 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS - 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS - 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS - 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS - 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> - 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> - 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> - 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7> - 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS - 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> - 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS - 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS - 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> - 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> - 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7> - 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS - 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS - 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS - 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> - 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> - 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> - 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> - 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> - 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> - 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> - 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> - 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS - 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS - 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS - 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS - 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS - 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS - 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS - 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS - 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS - 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> - 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> - 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> - 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> - 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> - 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1> - 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> - 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> - 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS - 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> - 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS - 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> - 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS - 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> - 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> - 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> - 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> - 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> - 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> - 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> - 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> - 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> - 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> - 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> - 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> - 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> - 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> - 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> - 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> - 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> - 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> - 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> - 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> - 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> - 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> - 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> - 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> - 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS - 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6> - 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> - 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> - 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> - 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3> - 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS - 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> - 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6> - 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5> - 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> - 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7> - 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS - 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> - 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> - 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> - 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> - 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1> - 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> - 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6> - 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> - 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> - 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2> - 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> - 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> - 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7> - 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6> - 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> - 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> - 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7> - 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> - 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> - 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> - 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS - 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> - 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> - 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS - 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7> - 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> - 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS - 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS - 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS - 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> - 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> - 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS - 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> - 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> - 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> - 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> - 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> - 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> - 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> - 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> - 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> - 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> - 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> - 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> - 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> - 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> - 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> - 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> - 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> - 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS - 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> - 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> - 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> - 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> - 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS - 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> - 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> - 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> - 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS - 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> - 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> - 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> - 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> - 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS - 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> - 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> - 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> - 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS - 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS - 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS - 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> - 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> - 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> - 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> - 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> - 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> - 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> - 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> - 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> - 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS - 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> - 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> - 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> - 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> - 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> - 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> - 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> - 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> - 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> - 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> - 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS - 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> - 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> - 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS - 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> - 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> - 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> - 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS - 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS - 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> - 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> - 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> - 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS - 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> - 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> - 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS - 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> - 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> - 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS - 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> - 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> - 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> - 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> - 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> - 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> - 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS - 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> - 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> - 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> - 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> - 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS - 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> - 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> - 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> - 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> - 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> - 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> - 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> - 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> - 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> - 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> - 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> - 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> - 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> - 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> - 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> - 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> - 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> - 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> - 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> - 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> - 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> - 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> - 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS - 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> - 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> - 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> - 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS - 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS - 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> - 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> - 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS - 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS - 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> - 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> - 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> - 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> - 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5> - 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> - 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7> - 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> - 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> - 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> - 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> - 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> - 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> - 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> - 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> - 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1> - 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> - 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2> - 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> - 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> - 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS - 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6> - 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> - 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> - 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7> - 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS - 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> - 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS - 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> - 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> - 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> - 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS - 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> - 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1> - 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> - 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> - 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> - 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> - 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> - 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> - 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> - 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> - 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> - 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> - 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> - 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> - 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> - 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> - 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS - 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> - 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> - 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> - 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> - 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS - 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> - 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> - 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> - 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS - 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> - 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> - 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> - 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> - 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS - 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> - 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> - 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS - 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS - 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> - 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> - 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> - 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS - 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS - 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> - 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> - 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> - 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> - 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> - 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS - 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> - 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> - 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS - 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> - 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> - 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> - 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS - 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> - 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0> - 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS - 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS - 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> - 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> - 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> - 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> - 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> - 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> - 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> - 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> - 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> - 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS - 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> - 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> - 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> - 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS - 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> - 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> - 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> - 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS - 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS - 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> - 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3> - 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS - 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS - 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> - 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> - 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS - 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS - 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> - 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS - 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> - 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> - 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> - 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> - 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> - 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> - 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS - 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> - 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> - 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> - 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> - 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS - 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> - 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> - 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> - 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> - 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS - 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> - 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> - 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> - 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> - 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> - 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> - 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> - 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> - 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> - 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> - 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> - 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> - 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> - 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS - 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS - 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> - 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> - 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS - 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> - 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> - 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> - 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> - 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS - 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> - 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> - 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS - 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS - 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> - 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> - 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> - 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS - 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> - 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS - 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> - 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS - 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> - 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> - 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3> - 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> - 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> - 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> - 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> - 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> - 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2> - 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> - 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> - 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> - 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6> - 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> - 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> - 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7> - 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> - 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS - 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS - 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> - 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1> - 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS - 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS - 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS - 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> - 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS - 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> - 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS - 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> - 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> - 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> - 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> - 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> - 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> - 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS - 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS - 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> - 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> - 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> - 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> - 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> - 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> - 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> - 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> - 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> - 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> - 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> - 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> - 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> - 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> - 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> - 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> - 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> - 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> - 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> - 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> - 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> - 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> - 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> - 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> - 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS - 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS - 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS - 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> - 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> - 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> - 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS - 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS - 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS - 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> - 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> - 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS - 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> - 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> - 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> - 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS - 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> - 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> - 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> - 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> - 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> - 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> - 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> - 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> - 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> - 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> - 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> - 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> - 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> - 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS - 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> - 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> - 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> - 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS - 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> - 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> - 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS - 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS - 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> - 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> - 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> - 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS - 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> - 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS - 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3> - 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS - 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS - 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> - 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> - 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> - 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> - 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> - 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> - 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS - 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> - 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> - 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> - 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> - 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> - 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> - 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> - 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> - 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> - 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> - 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS - 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> - 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> - 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> - 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> - 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> - 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> - 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> - 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> - 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> - 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> - 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> - 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> - 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> - 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> - 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> - 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS - 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> - 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> - 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> - 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> - 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> - 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> - 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> - 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS - 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> - 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS - 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> - 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> - 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> - 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> - 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> - 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> - 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS - 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> - 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> - 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> - 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6> - 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> - 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> - 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> - 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> - 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> - 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> - 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> - 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> - 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> - 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> - 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> - 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS - 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> - 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> - 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> - 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> - 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> - 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> - 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> - 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> - 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> - 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> - 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> - 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> - 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> - 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> - 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> - 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> - 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS - 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> - 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> - 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> - 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> - 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS - 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> - 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> - 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> - 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> - 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS - 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> - 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> - 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> - 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS - 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> - 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> - 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> - 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> - 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> - 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> - 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> - 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> - 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> - 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> - 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> - 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> - 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> - 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS - 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> - 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> - 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> - 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS - 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS - 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> - 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> - 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS - 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> - 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> - 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> - 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> - 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> - 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> - 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> - 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS - 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS - 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> - 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> - 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> - 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> - 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> - 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> - 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> - 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> - 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> - 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS - 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> - 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> - 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> - 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS - 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> - 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> - 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> - 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2> - 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0> - 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS - 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6> - 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS - 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7> - 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS - 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2> - 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2> - 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> - 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1> - 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1> - 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2> - 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS - 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2> - 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> - 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> - 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS - 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3> - 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS - 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> - 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> - 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3> - 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> - 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> - 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0> - 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7> - 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> - 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3> - 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0> - 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1> - 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> - 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2> - 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS - 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5> - 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> - 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7> - 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS - 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS - 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5> - 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> - 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5> - 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> - 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6> - 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6> - 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6> - 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6> - 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS - 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7> - 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> - 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> - 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS - 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> - 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS - 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7> - 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS - 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1> - 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> - 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7> - 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> - 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7> - 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0> - 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7> - 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS - 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> - 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> - 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS - 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2> - 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS - 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1> - 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2> - 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS - 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS - 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5> - 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6> - 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS - 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0> - 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS - 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> - 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> - 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> - 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> - 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> - 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> - 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> - 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> - 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> - 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS - 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> - 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> - 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS - 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> - 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> - 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> - 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> - 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> - 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> - 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> - 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> - 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> - 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> - 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> - 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS - 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> - 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> - 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS - 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> - 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> - 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> - 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS - 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> - 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> - 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> - 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS - 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS - 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> - 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> - 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS - 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS - 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS - 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> - 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> - 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS - 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> - 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> - 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> - 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS - 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS - 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> - 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS - 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> - 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS - 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> - 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> - 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> - 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS - 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> - 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS - 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS - 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> - 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> - 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> - 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> - 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> - 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS - 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS - 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS - 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> - 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS - 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS - 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> - 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> - 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS - 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> - 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> - 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> - 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> - 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> - 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> - 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS - 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> - 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> - 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> - 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> - 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> - 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> - 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> - 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> - 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> - 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS - 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> - 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> - 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> - 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS - 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> - 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> - 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> - 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> - 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS - 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> - 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> - 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> - 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS - 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> - 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> - 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> - 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> - 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> - 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> - 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS - 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> - 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS - 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> - 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> - 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS - 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> - 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> - 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> - 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> - 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3> - 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> - 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS - 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS - 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> - 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> - 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS - 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS - 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> - 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> - 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> - 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS - 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> - 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> - 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> - 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> - 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> - 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> - 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> - 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS - 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> - 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> - 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> - 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS - 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS - 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> - 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS - 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> - 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS - 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> - 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> - 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> - 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> - 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> - 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> - 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS - 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> - 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> - 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> - 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS - 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> - 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> - 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> - 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> - 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> - 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> - 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> - 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> - 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> - 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> - 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> - 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> - 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> - 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> - 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> - 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> - 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> - 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> - 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> - 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> - 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> - 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS - 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> - 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> - 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> - 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS - 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS - 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> - 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> - 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS - 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS - 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> - 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> - 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS - 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS - 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> - 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> - 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS - 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS - 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS - 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> - 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> - 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS - 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS - 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> - 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3> - 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2> - 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS - 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> - 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> - 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> - 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> - 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> - 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> - 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> - 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> - 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> - 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS - 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> - 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> - 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS - 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS - 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS - 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> - 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2> - 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS - 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> - 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> - 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> - 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> - 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> - 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> - 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> - 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> - 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> - 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> - 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> - 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> - 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> - 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> - 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> - 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> - 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> - 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> - 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS - 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> - 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> - 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> - 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS - 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> - 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> - 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> - 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> - 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> - 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> - 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> - 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> - 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> - 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> - 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> - 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> - 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> - 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> - 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> - 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> - 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> - 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> - 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> - 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> - 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> - 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> - 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS - 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> - 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> - 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> - 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS - 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> - 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> - 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> - 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS - 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS - 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> - 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> - 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> - 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS - 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> - 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> - 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> - 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> - 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> - 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> - 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> - 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> - 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> - 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> - 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> - 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> - 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> - 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS - 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> - 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> - 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> - 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS - 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> - 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> - 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> - 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> - 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> - 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS - 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> - 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> - 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> - 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> - 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> - 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS - 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> - 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> - 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> - 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> - 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> - 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> - 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> - 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> - 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> - 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> - 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> - 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> - 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> - 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> - 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> - 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> - 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> - 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> - 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> - 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> - 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> - 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> - 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> - 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> - 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> - 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> - 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> - 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS - 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> - 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> - 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> - 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS - 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS - 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS - 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS - 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS - 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> - 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> - 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> - 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS - 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS - 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> - 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS - 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> - 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> - 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> - 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS - 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> - 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS - 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> - 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS - 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> - 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> - 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> - 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> - 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> - 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> - 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> - 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS - 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS - 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> - 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS - 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS - 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> - 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS - 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> - 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> - 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> - 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> - 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> - 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> - 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> - 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS - 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> - 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> - 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> - 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS - 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> - 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> - 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> - 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> - 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS - 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS - 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> - 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> - 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> - 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> - 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> - 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> - 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS - 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> - 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> - 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> - 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> - 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> - 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> - 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> - 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> - 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> - 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> - 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS - 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> - 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> - 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> - 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS - 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> - 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> - 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS - 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS - 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> - 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> - 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> - 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> - 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> - 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> - 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS - 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS - 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS - 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> - 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> - 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> - 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS - 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> - 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> - 27705344U, // <4,5,6,7>: Cost 0 copy RHS - 27705344U, // <4,5,6,u>: Cost 0 copy RHS - 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS - 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> - 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> - 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> - 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS - 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> - 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> - 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> - 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS - 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS - 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0> - 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> - 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS - 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7> - 27705344U, // <4,5,u,7>: Cost 0 copy RHS - 27705344U, // <4,5,u,u>: Cost 0 copy RHS - 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> - 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> - 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> - 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> - 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> - 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> - 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS - 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS - 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> - 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> - 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> - 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> - 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> - 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> - 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> - 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS - 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> - 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> - 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> - 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> - 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> - 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> - 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> - 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> - 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> - 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> - 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> - 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> - 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> - 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> - 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> - 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> - 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> - 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> - 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> - 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS - 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> - 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> - 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> - 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS - 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS - 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> - 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS - 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS - 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> - 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> - 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> - 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> - 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> - 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> - 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS - 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS - 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS - 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> - 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> - 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> - 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> - 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> - 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> - 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS - 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS - 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> - 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> - 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> - 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> - 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> - 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> - 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> - 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS - 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS - 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1> - 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> - 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS - 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS - 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS - 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> - 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> - 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> - 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> - 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> - 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> - 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> - 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS - 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> - 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> - 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> - 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> - 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS - 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> - 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> - 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> - 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> - 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS - 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> - 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> - 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> - 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> - 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> - 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> - 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> - 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> - 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> - 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> - 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> - 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> - 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> - 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> - 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> - 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> - 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> - 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> - 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> - 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> - 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> - 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> - 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS - 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> - 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> - 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS - 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> - 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> - 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> - 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> - 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> - 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> - 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> - 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> - 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> - 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS - 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> - 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> - 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> - 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS - 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> - 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> - 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS - 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS - 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> - 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> - 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> - 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> - 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> - 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> - 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> - 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> - 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS - 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> - 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> - 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS - 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> - 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> - 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS - 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> - 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> - 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2> - 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> - 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0> - 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> - 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> - 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS - 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> - 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> - 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> - 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3> - 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> - 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> - 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3> - 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS - 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> - 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> - 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> - 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> - 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4> - 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> - 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3> - 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> - 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> - 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> - 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> - 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> - 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> - 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7> - 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> - 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> - 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS - 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2> - 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> - 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> - 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS - 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS - 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS - 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6> - 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS - 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS - 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> - 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS - 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS - 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS - 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS - 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> - 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS - 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> - 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS - 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> - 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS - 27705344U, // <4,u,6,7>: Cost 0 copy RHS - 27705344U, // <4,u,6,u>: Cost 0 copy RHS - 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS - 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> - 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> - 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> - 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS - 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> - 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS - 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS - 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> - 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS - 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS - 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 27705344U, // <4,u,u,7>: Cost 0 copy RHS - 27705344U, // <4,u,u,u>: Cost 0 copy RHS - 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> - 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> - 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> - 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> - 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> - 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> - 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> - 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> - 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> - 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS - 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> - 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS - 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> - 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS - 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> - 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> - 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> - 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> - 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> - 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> - 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> - 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> - 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> - 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> - 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> - 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> - 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> - 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> - 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> - 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> - 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> - 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> - 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> - 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> - 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> - 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS - 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> - 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> - 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> - 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS - 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS - 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> - 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> - 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> - 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> - 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS - 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS - 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> - 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> - 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> - 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> - 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS - 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS - 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> - 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS - 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> - 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> - 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> - 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> - 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> - 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> - 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS - 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS - 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> - 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> - 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> - 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS - 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> - 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> - 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> - 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS - 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> - 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> - 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS - 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> - 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> - 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS - 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> - 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> - 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> - 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS - 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> - 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> - 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> - 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> - 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> - 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> - 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS - 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> - 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> - 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> - 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> - 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> - 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> - 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> - 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> - 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> - 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> - 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> - 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> - 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> - 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> - 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> - 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> - 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0> - 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> - 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS - 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> - 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> - 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> - 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> - 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> - 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> - 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> - 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> - 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> - 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> - 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> - 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> - 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> - 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS - 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> - 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> - 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> - 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> - 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> - 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> - 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> - 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> - 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> - 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> - 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> - 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> - 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS - 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> - 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> - 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> - 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS - 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> - 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> - 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> - 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> - 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS - 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> - 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> - 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS - 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS - 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> - 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> - 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> - 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS - 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1> - 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS - 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> - 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS - 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> - 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS - 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7> - 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1> - 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS - 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> - 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS - 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> - 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> - 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> - 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> - 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> - 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> - 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS - 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> - 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> - 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> - 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> - 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS - 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> - 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> - 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> - 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> - 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS - 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> - 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> - 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> - 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> - 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> - 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> - 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> - 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> - 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> - 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> - 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> - 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> - 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> - 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> - 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> - 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> - 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> - 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> - 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> - 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> - 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> - 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> - 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS - 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> - 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> - 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS - 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS - 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> - 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> - 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS - 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS - 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> - 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> - 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> - 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS - 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS - 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> - 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> - 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> - 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> - 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> - 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> - 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> - 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> - 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS - 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> - 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> - 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS - 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS - 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> - 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> - 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> - 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS - 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> - 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS - 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> - 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> - 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> - 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS - 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> - 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> - 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> - 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> - 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> - 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> - 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> - 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> - 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> - 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> - 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> - 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> - 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> - 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> - 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> - 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> - 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> - 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> - 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> - 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> - 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> - 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> - 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> - 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> - 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> - 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> - 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> - 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> - 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> - 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> - 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> - 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> - 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> - 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> - 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> - 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> - 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> - 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> - 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> - 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> - 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> - 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> - 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> - 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> - 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> - 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> - 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> - 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> - 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS - 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> - 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> - 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> - 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS - 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> - 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> - 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> - 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS - 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS - 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> - 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> - 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> - 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS - 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> - 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> - 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> - 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS - 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS - 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> - 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> - 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> - 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS - 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> - 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3> - 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> - 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS - 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS - 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> - 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> - 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> - 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS - 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> - 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3> - 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> - 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS - 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS - 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS - 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> - 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> - 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> - 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> - 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> - 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> - 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS - 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> - 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> - 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> - 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> - 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> - 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> - 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> - 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> - 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> - 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> - 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> - 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> - 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> - 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> - 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> - 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> - 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> - 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5> - 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS - 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> - 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> - 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> - 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> - 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> - 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> - 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> - 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> - 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS - 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> - 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> - 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> - 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> - 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> - 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> - 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> - 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> - 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS - 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> - 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> - 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> - 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS - 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> - 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS - 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS - 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS - 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS - 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> - 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> - 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> - 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS - 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> - 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> - 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> - 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS - 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS - 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> - 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> - 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> - 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS - 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> - 94817590U, // <5,4,7,6>: Cost 1 vrev RHS - 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> - 94965064U, // <5,4,7,u>: Cost 1 vrev RHS - 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS - 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> - 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> - 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4> - 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS - 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> - 94825783U, // <5,4,u,6>: Cost 1 vrev RHS - 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> - 94973257U, // <5,4,u,u>: Cost 1 vrev RHS - 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> - 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS - 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> - 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> - 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> - 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> - 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> - 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS - 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS - 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> - 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> - 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> - 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> - 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> - 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> - 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> - 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> - 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> - 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS - 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> - 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> - 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> - 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> - 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> - 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> - 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> - 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> - 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> - 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> - 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> - 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> - 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> - 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> - 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> - 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> - 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> - 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> - 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> - 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> - 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> - 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> - 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS - 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> - 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> - 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> - 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS - 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> - 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> - 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> - 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS - 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> - 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> - 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS - 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS - 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> - 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> - 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> - 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> - 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> - 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> - 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> - 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> - 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS - 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> - 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> - 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> - 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS - 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> - 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> - 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS - 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS - 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS - 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS - 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> - 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> - 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS - 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7> - 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS - 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS - 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> - 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS - 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> - 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> - 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> - 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> - 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> - 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS - 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS - 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> - 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> - 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> - 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> - 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> - 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> - 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> - 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS - 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> - 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> - 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> - 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> - 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> - 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> - 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> - 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> - 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3> - 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> - 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> - 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> - 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> - 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> - 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> - 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> - 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> - 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS - 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> - 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS - 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> - 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> - 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> - 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> - 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS - 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> - 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> - 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS - 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS - 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> - 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> - 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> - 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> - 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> - 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> - 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS - 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS - 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS - 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> - 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> - 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> - 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS - 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> - 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> - 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS - 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS - 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS - 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> - 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> - 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS - 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> - 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> - 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> - 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS - 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS - 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS - 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> - 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS - 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS - 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> - 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> - 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS - 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> - 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS - 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> - 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> - 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> - 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> - 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> - 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> - 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS - 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> - 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> - 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> - 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> - 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS - 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> - 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> - 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> - 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> - 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> - 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> - 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> - 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> - 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> - 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> - 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> - 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> - 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> - 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> - 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> - 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> - 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> - 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> - 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> - 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> - 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> - 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> - 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS - 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> - 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> - 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> - 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS - 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS - 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> - 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> - 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS - 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS - 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> - 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> - 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> - 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS - 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> - 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> - 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS - 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS - 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> - 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> - 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> - 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> - 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> - 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> - 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> - 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> - 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> - 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS - 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> - 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> - 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> - 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS - 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> - 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> - 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> - 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS - 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS - 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS - 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0> - 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS - 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS - 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS - 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7> - 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS - 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS - 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> - 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS - 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> - 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2> - 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1> - 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1> - 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0> - 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> - 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS - 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> - 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> - 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS - 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> - 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> - 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0> - 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> - 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS - 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> - 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> - 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> - 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3> - 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> - 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> - 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> - 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3> - 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> - 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1> - 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> - 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> - 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> - 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> - 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> - 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5> - 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> - 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> - 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> - 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5> - 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6> - 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5> - 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> - 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS - 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> - 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> - 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS - 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS - 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> - 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3> - 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7> - 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS - 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS - 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS - 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS - 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS - 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS - 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> - 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7> - 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS - 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS - 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> - 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS - 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7> - 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS - 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> - 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS - 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS - 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 118708378U, // <5,u,7,6>: Cost 1 vrev RHS - 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS - 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS - 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS - 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS - 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS - 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS - 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS - 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS - 118716571U, // <5,u,u,6>: Cost 1 vrev RHS - 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS - 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS - 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> - 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> - 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> - 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> - 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> - 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> - 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> - 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> - 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> - 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS - 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> - 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS - 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> - 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS - 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> - 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> - 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> - 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS - 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> - 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> - 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> - 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> - 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> - 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> - 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> - 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> - 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> - 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> - 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> - 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> - 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> - 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> - 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> - 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> - 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> - 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> - 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> - 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> - 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> - 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> - 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> - 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS - 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> - 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> - 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> - 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS - 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> - 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> - 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> - 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> - 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> - 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> - 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS - 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS - 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> - 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS - 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS - 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> - 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> - 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> - 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> - 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> - 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS - 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS - 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> - 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> - 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> - 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS - 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> - 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> - 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> - 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS - 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> - 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> - 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS - 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> - 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> - 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS - 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> - 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS - 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS - 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS - 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS - 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> - 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> - 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS - 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> - 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> - 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> - 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> - 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> - 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> - 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> - 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> - 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> - 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> - 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> - 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> - 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> - 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS - 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> - 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> - 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> - 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS - 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> - 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> - 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> - 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> - 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS - 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> - 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> - 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> - 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> - 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> - 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> - 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> - 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> - 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> - 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> - 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> - 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> - 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS - 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> - 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> - 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> - 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> - 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> - 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> - 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> - 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> - 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> - 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> - 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> - 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS - 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> - 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS - 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> - 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> - 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS - 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS - 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> - 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> - 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> - 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS - 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS - 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> - 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> - 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS - 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS - 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> - 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> - 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> - 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS - 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS - 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> - 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> - 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> - 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> - 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> - 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> - 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> - 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> - 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> - 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS - 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> - 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> - 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> - 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> - 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> - 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> - 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS - 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> - 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> - 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> - 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS - 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> - 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> - 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> - 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> - 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> - 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> - 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> - 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> - 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> - 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> - 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> - 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> - 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> - 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> - 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> - 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> - 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> - 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> - 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> - 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> - 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> - 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> - 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> - 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> - 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> - 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> - 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> - 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> - 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS - 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> - 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> - 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> - 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> - 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> - 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> - 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> - 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> - 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> - 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> - 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS - 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> - 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> - 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> - 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> - 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> - 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> - 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> - 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> - 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> - 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> - 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS - 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> - 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> - 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS - 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS - 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> - 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> - 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> - 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS - 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2> - 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS - 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> - 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS - 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> - 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS - 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> - 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS - 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS - 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> - 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> - 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> - 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> - 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> - 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> - 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> - 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> - 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> - 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> - 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> - 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> - 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> - 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> - 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> - 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> - 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> - 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> - 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> - 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> - 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> - 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> - 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> - 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> - 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> - 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> - 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> - 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> - 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> - 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> - 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> - 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> - 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> - 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> - 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> - 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> - 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS - 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> - 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> - 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> - 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS - 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> - 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> - 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> - 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> - 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS - 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> - 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> - 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> - 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> - 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> - 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> - 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> - 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> - 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS - 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> - 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> - 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> - 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6> - 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> - 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> - 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> - 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS - 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS - 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> - 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> - 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> - 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS - 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3> - 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> - 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> - 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS - 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS - 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> - 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> - 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> - 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS - 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> - 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> - 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> - 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS - 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> - 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS - 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> - 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> - 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> - 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> - 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> - 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> - 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS - 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> - 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> - 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> - 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS - 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS - 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> - 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> - 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> - 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> - 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> - 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> - 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> - 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> - 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> - 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS - 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> - 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> - 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> - 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> - 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> - 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> - 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> - 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> - 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> - 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> - 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> - 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6> - 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS - 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> - 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> - 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> - 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> - 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS - 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> - 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> - 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> - 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS - 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> - 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> - 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> - 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS - 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> - 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS - 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS - 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS - 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS - 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> - 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> - 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> - 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS - 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> - 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> - 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> - 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS - 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS - 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> - 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> - 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> - 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS - 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> - 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> - 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> - 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS - 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS - 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS - 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> - 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> - 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> - 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS - 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS - 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS - 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS - 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> - 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS - 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> - 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> - 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> - 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> - 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> - 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> - 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> - 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> - 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> - 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> - 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS - 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> - 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> - 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> - 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> - 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> - 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS - 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> - 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> - 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> - 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS - 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> - 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> - 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS - 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> - 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2> - 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> - 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> - 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> - 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> - 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> - 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> - 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> - 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> - 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS - 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> - 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> - 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> - 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS - 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS - 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> - 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> - 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> - 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS - 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> - 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> - 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> - 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS - 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> - 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> - 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> - 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> - 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> - 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> - 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> - 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> - 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> - 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> - 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> - 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> - 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> - 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS - 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> - 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> - 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> - 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS - 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> - 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> - 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS - 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS - 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS - 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> - 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> - 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7> - 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS - 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS - 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> - 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> - 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> - 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS - 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> - 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> - 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> - 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> - 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> - 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS - 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS - 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> - 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> - 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> - 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> - 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> - 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> - 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> - 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS - 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> - 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> - 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> - 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> - 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> - 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> - 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> - 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> - 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> - 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> - 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> - 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> - 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> - 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> - 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> - 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> - 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS - 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> - 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS - 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> - 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> - 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> - 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS - 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> - 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> - 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> - 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS - 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> - 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> - 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> - 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> - 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> - 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> - 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS - 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS - 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS - 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> - 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> - 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> - 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS - 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> - 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS - 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> - 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS - 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS - 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> - 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> - 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> - 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS - 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> - 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> - 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS - 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS - 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS - 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> - 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS - 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS - 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS - 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS - 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS - 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS - 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> - 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> - 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> - 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> - 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS - 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> - 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> - 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> - 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> - 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> - 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> - 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> - 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> - 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> - 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> - 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> - 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> - 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> - 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> - 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> - 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> - 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> - 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> - 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> - 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> - 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> - 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> - 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS - 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> - 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS - 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> - 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> - 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> - 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> - 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> - 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> - 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> - 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> - 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> - 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> - 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> - 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> - 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> - 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> - 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> - 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> - 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> - 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> - 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> - 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> - 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> - 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> - 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> - 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS - 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> - 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> - 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> - 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS - 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> - 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1> - 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS - 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS - 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2> - 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> - 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> - 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0> - 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS - 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS - 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> - 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS - 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> - 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> - 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3> - 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS - 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS - 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> - 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS - 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> - 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> - 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3> - 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> - 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> - 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> - 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7> - 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5> - 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> - 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> - 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6> - 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6> - 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS - 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6> - 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS - 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS - 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> - 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7> - 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS - 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> - 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS - 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS - 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> - 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7> - 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS - 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> - 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS - 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS - 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS - 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> - 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> - 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS - 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS - 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> - 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> - 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS - 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS - 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS - 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS - 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS - 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1> - 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS - 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS - 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS - 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS - 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS - 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> - 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> - 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> - 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> - 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> - 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> - 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> - 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> - 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> - 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS - 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> - 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS - 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> - 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS - 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> - 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> - 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> - 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS - 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> - 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> - 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> - 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> - 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> - 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> - 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> - 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> - 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> - 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> - 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> - 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> - 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> - 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> - 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> - 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> - 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> - 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> - 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> - 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> - 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> - 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> - 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> - 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS - 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> - 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> - 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> - 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS - 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> - 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> - 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> - 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> - 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> - 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> - 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> - 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> - 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> - 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> - 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> - 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> - 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> - 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> - 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> - 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> - 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> - 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> - 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS - 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS - 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> - 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> - 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> - 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> - 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> - 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> - 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> - 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> - 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS - 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> - 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> - 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS - 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> - 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1> - 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS - 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS - 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS - 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS - 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> - 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS - 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> - 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> - 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> - 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> - 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> - 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> - 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> - 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> - 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> - 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> - 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> - 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> - 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> - 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> - 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> - 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> - 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> - 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> - 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> - 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> - 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> - 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> - 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> - 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> - 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> - 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> - 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> - 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> - 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> - 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> - 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> - 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> - 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> - 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> - 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> - 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> - 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS - 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> - 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> - 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS - 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS - 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> - 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> - 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> - 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS - 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> - 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> - 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> - 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> - 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> - 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> - 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> - 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> - 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> - 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> - 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> - 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> - 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> - 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> - 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> - 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> - 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS - 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS - 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> - 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> - 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> - 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS - 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS - 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> - 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> - 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> - 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS - 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> - 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> - 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS - 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> - 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> - 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> - 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> - 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> - 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> - 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> - 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> - 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> - 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> - 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> - 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0> - 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> - 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> - 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> - 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> - 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> - 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> - 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> - 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> - 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> - 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> - 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> - 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> - 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> - 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> - 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> - 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> - 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> - 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> - 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> - 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> - 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> - 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> - 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> - 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> - 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> - 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> - 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> - 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> - 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> - 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> - 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> - 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> - 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> - 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> - 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> - 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> - 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> - 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> - 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> - 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> - 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> - 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> - 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS - 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> - 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> - 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> - 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS - 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> - 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> - 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> - 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> - 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> - 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> - 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> - 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS - 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS - 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> - 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> - 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> - 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS - 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> - 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> - 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> - 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> - 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> - 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> - 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> - 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> - 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> - 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> - 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> - 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> - 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> - 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> - 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> - 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> - 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> - 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> - 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> - 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> - 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> - 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> - 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> - 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> - 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> - 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> - 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> - 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> - 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> - 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> - 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> - 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> - 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> - 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> - 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> - 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> - 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> - 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> - 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> - 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> - 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> - 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> - 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> - 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> - 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> - 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> - 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> - 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> - 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> - 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> - 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> - 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> - 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> - 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS - 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> - 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> - 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> - 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> - 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> - 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> - 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> - 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> - 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> - 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> - 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> - 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> - 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> - 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> - 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> - 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> - 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> - 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> - 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> - 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> - 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> - 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> - 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> - 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> - 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> - 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> - 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> - 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> - 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3> - 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> - 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> - 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> - 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> - 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> - 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> - 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> - 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS - 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> - 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> - 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> - 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> - 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> - 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> - 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> - 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> - 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> - 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> - 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> - 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> - 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> - 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> - 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> - 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> - 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> - 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> - 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> - 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> - 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> - 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> - 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> - 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> - 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> - 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> - 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> - 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> - 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> - 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> - 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> - 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> - 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> - 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> - 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> - 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> - 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> - 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> - 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> - 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> - 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> - 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> - 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> - 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS - 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> - 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> - 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> - 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS - 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> - 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS - 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> - 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS - 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> - 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> - 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> - 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> - 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> - 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> - 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> - 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> - 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> - 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> - 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> - 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> - 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> - 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> - 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> - 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> - 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> - 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> - 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS - 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS - 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> - 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> - 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> - 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> - 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS - 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> - 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS - 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS - 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS - 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS - 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> - 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> - 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> - 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> - 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> - 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS - 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> - 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> - 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> - 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> - 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> - 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> - 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> - 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> - 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> - 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> - 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> - 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> - 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> - 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> - 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> - 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> - 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> - 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> - 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS - 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> - 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> - 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> - 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS - 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> - 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> - 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> - 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> - 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS - 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> - 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> - 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> - 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS - 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS - 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> - 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> - 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> - 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> - 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> - 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> - 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> - 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> - 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> - 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> - 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> - 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> - 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> - 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> - 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> - 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> - 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> - 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> - 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> - 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> - 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> - 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS - 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> - 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> - 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> - 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS - 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> - 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> - 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> - 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> - 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS - 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS - 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> - 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> - 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS - 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> - 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> - 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> - 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> - 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> - 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS - 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> - 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> - 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> - 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> - 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> - 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> - 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS - 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> - 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> - 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> - 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> - 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS - 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> - 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> - 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> - 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> - 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> - 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> - 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> - 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> - 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> - 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> - 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> - 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> - 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> - 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> - 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> - 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> - 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> - 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> - 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> - 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> - 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> - 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> - 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> - 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> - 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> - 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> - 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> - 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS - 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> - 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> - 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS - 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS - 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> - 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> - 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> - 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> - 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> - 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> - 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> - 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> - 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> - 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> - 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> - 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> - 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> - 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> - 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> - 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> - 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> - 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> - 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> - 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> - 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> - 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> - 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> - 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> - 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> - 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> - 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> - 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS - 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> - 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> - 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> - 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS - 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> - 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> - 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> - 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> - 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> - 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> - 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> - 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> - 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> - 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> - 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> - 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> - 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> - 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> - 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> - 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> - 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS - 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> - 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> - 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> - 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> - 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> - 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> - 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> - 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> - 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> - 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> - 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> - 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3> - 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> - 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> - 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> - 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> - 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> - 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> - 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> - 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> - 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> - 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> - 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> - 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> - 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> - 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> - 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> - 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> - 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> - 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> - 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> - 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS - 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> - 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> - 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> - 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> - 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> - 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> - 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> - 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> - 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS - 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> - 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> - 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> - 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS - 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> - 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> - 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> - 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> - 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS - 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> - 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> - 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> - 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS - 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> - 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> - 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS - 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS - 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS - 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> - 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> - 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> - 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS - 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> - 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7> - 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS - 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS - 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> - 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2> - 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2> - 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2> - 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1> - 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1> - 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> - 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS - 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2> - 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS - 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> - 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS - 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3> - 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS - 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3> - 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> - 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> - 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS - 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> - 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0> - 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0> - 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> - 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4> - 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3> - 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> - 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0> - 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> - 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> - 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6> - 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> - 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> - 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> - 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7> - 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0> - 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> - 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> - 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5> - 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> - 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5> - 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> - 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> - 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6> - 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> - 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> - 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS - 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> - 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3> - 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7> - 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> - 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> - 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS - 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> - 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS - 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1> - 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7> - 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> - 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7> - 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> - 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7> - 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> - 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0> - 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7> - 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> - 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3> - 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6> - 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> - 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5> - 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> - 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2> - 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS - 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS - 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> - 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> - 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS - 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0> - 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> - 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> - 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS - 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS - 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS - 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS - 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> - 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> - 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS - 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3> - 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0> - 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS - 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS - 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS - 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS - 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3> - 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS - 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7> - 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> - 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2> - 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS - 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> - 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS - 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> - 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> - 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7> - 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> - 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> - 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> - 72589981U, // <u,0,3,2>: Cost 1 vrev LHS - 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3> - 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6> - 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0> - 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0> - 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3> - 73032403U, // <u,0,3,u>: Cost 1 vrev LHS - 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u> - 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> - 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> - 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2> - 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS - 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS - 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6> - 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> - 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> - 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS - 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS - 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> - 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0> - 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5> - 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5> - 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> - 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS - 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0> - 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS - 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> - 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS - 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0> - 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS - 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u> - 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6> - 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1> - 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS - 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS - 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0> - 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7> - 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7> - 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS - 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u> - 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7> - 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7> - 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS - 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS - 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> - 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS - 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> - 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> - 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS - 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> - 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS - 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS - 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1> - 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS - 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1> - 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> - 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS - 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> - 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0> - 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0> - 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> - 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS - 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS - 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2> - 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> - 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS - 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> - 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1> - 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> - 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS - 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS - 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> - 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1> - 835584U, // <u,1,2,3>: Cost 0 copy LHS - 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS - 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> - 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7> - 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2> - 835584U, // <u,1,2,u>: Cost 0 copy LHS - 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS - 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> - 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> - 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS - 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS - 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> - 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> - 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1> - 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> - 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> - 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u> - 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u> - 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4> - 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS - 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS - 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1> - 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4> - 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4> - 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS - 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2> - 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2> - 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> - 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS - 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1> - 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1> - 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS - 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> - 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS - 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> - 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> - 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> - 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS - 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> - 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1> - 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1> - 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1> - 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS - 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7> - 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1> - 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS - 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS - 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7> - 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u> - 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7> - 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS - 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS - 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS - 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> - 835584U, // <u,1,u,3>: Cost 0 copy LHS - 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS - 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> - 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> - 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u> - 835584U, // <u,1,u,u>: Cost 0 copy LHS - 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2> - 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS - 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2> - 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> - 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5> - 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7> - 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> - 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u> - 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS - 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> - 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1> - 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0> - 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> - 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS - 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7> - 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> - 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> - 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1> - 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS - 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2> - 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS - 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> - 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS - 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> - 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7> - 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> - 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS - 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS - 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS - 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS - 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> - 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> - 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS - 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> - 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4> - 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5> - 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> - 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS - 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS - 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2> - 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS - 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS - 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0> - 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> - 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> - 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> - 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5> - 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> - 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS - 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5> - 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS - 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2> - 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2> - 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> - 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS - 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> - 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> - 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1> - 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> - 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2> - 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2> - 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7> - 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS - 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS - 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6> - 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> - 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7> - 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS - 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS - 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS - 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS - 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> - 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS - 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS - 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS - 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> - 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS - 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> - 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS - 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0> - 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0> - 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> - 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0> - 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS - 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> - 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> - 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5> - 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7> - 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u> - 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> - 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS - 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> - 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> - 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS - 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u> - 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> - 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3> - 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3> - 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS - 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> - 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3> - 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> - 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS - 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS - 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> - 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> - 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2> - 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS - 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS - 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4> - 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS - 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS - 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> - 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5> - 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6> - 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> - 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS - 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS - 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> - 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6> - 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> - 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u> - 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> - 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> - 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6> - 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS - 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> - 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> - 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2> - 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS - 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u> - 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u> - 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS - 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> - 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS - 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> - 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS - 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> - 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS - 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> - 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS - 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS - 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0> - 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS - 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4> - 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> - 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5> - 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> - 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> - 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> - 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS - 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS - 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1> - 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> - 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3> - 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS - 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS - 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> - 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4> - 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS - 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u> - 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2> - 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5> - 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS - 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> - 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS - 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS - 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2> - 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4> - 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3> - 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3> - 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6> - 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> - 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> - 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> - 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> - 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS - 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4> - 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4> - 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> - 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS - 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS - 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> - 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS - 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS - 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> - 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> - 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2> - 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS - 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS - 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS - 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS - 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS - 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS - 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2> - 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2> - 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2> - 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS - 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> - 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS - 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> - 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS - 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS - 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4> - 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7> - 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7> - 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS - 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> - 96808489U, // <u,4,7,6>: Cost 1 vrev RHS - 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7> - 96955963U, // <u,4,7,u>: Cost 1 vrev RHS - 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS - 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS - 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> - 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2> - 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS - 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS - 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS - 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS - 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS - 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0> - 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS - 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5> - 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5> - 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> - 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> - 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5> - 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0> - 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS - 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS - 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5> - 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0> - 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7> - 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS - 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5> - 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3> - 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> - 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> - 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS - 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7> - 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5> - 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5> - 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS - 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> - 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7> - 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS - 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5> - 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS - 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3> - 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u> - 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3> - 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6> - 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5> - 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> - 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS - 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS - 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS - 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> - 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5> - 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4> - 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5> - 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS - 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5> - 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> - 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS - 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS - 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7> - 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2> - 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2> - 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS - 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS - 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> - 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> - 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS - 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS - 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6> - 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6> - 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6> - 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS - 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> - 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> - 27705344U, // <u,5,6,7>: Cost 0 copy RHS - 27705344U, // <u,5,6,u>: Cost 0 copy RHS - 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS - 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> - 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2> - 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2> - 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS - 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> - 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> - 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS - 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS - 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS - 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u> - 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2> - 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u> - 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS - 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS - 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> - 27705344U, // <u,5,u,7>: Cost 0 copy RHS - 27705344U, // <u,5,u,u>: Cost 0 copy RHS - 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0> - 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS - 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6> - 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0> - 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> - 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u> - 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0> - 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS - 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS - 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> - 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1> - 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0> - 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3> - 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6> - 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7> - 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> - 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS - 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1> - 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS - 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2> - 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2> - 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1> - 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS - 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6> - 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7> - 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> - 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> - 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2> - 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3> - 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3> - 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3> - 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> - 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6> - 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> - 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS - 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS - 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS - 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3> - 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4> - 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u> - 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS - 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> - 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS - 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS - 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS - 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3> - 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7> - 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> - 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5> - 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0> - 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS - 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5> - 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS - 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2> - 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3> - 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2> - 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS - 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3> - 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS - 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> - 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS - 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS - 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> - 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> - 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS - 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3> - 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS - 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS - 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS - 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS - 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> - 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS - 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS - 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS - 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS - 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS - 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS - 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0> - 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> - 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7> - 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2> - 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS - 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> - 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7> - 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS - 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> - 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7> - 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u> - 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7> - 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7> - 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> - 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7> - 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7> - 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> - 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u> - 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7> - 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u> - 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u> - 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3> - 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> - 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7> - 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7> - 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> - 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4> - 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS - 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0> - 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS - 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> - 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> - 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> - 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> - 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> - 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> - 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS - 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2> - 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> - 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS - 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> - 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> - 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> - 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> - 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7> - 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2> - 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7> - 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> - 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7> - 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7> - 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS - 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS - 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> - 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS - 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> - 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> - 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> - 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS - 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> - 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS - 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS - 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS - 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS - 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> - 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> - 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> - 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0> - 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS - 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS - 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS - 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS - 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> - 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> - 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS - 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS - 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2> - 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS - 835584U, // <u,u,2,3>: Cost 0 copy LHS - 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS - 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> - 835584U, // <u,u,2,u>: Cost 0 copy LHS - 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS - 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 120371557U, // <u,u,3,2>: Cost 1 vrev LHS - 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS - 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS - 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> - 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS - 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS - 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS - 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> - 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> - 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4> - 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS - 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS - 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> - 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS - 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS - 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> - 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> - 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS - 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS - 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS - 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS - 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS - 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS - 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2> - 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> - 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS - 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6> - 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS - 27705344U, // <u,u,6,7>: Cost 0 copy RHS - 27705344U, // <u,u,6,u>: Cost 0 copy RHS - 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS - 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> - 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> - 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS - 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS - 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 120699277U, // <u,u,7,6>: Cost 1 vrev RHS - 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS - 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS - 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS - 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS - 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS - 835584U, // <u,u,u,3>: Cost 0 copy LHS - 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS - 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS - 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS - 27705344U, // <u,u,u,7>: Cost 0 copy RHS - 835584U, // <u,u,u,u>: Cost 0 copy LHS + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS + 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> + 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> + 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> + 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> + 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS + 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> + 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> + 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> + 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> + 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> + 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS + 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> + 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> + 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS + 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> + 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> + 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> + 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> + 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS + 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> + 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> + 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> + 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> + 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> + 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> + 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> + 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> + 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> + 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> + 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> + 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS + 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> + 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> + 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> + 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> + 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> + 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> + 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> + 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> + 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> + 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> + 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS + 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> + 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> + 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS + 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> + 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> + 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> + 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> + 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> + 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> + 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> + 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> + 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> + 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> + 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> + 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS + 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS + 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> + 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS + 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> + 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS + 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> + 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> + 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> + 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> + 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> + 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> + 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> + 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS + 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> + 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> + 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> + 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> + 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS + 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> + 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> + 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> + 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS + 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS + 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> + 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS + 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> + 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> + 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> + 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> + 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> + 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> + 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS + 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> + 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> + 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> + 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> + 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS + 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> + 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> + 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> + 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS + 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> + 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS + 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> + 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> + 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> + 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> + 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> + 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> + 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> + 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> + 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> + 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS + 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> + 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> + 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> + 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS + 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> + 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> + 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> + 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> + 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> + 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> + 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> + 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> + 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> + 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> + 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> + 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> + 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> + 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS + 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS + 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> + 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS + 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> + 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> + 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> + 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> + 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> + 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS + 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> + 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> + 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> + 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS + 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> + 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> + 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> + 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> + 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> + 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> + 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> + 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS + 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> + 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> + 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> + 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS + 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> + 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> + 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> + 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> + 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> + 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS + 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> + 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> + 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> + 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS + 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS + 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> + 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> + 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> + 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> + 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> + 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> + 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> + 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS + 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> + 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> + 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> + 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> + 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> + 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> + 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> + 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> + 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> + 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> + 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> + 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> + 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> + 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> + 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> + 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> + 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> + 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS + 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS + 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS + 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS + 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> + 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> + 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> + 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> + 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> + 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> + 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS + 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> + 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> + 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> + 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> + 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> + 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> + 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> + 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> + 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> + 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> + 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3> + 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2> + 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS + 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> + 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> + 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> + 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> + 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> + 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> + 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> + 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> + 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> + 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> + 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> + 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> + 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> + 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> + 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> + 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS + 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> + 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> + 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS + 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> + 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> + 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> + 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> + 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> + 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> + 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> + 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> + 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> + 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> + 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> + 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> + 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> + 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> + 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> + 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> + 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> + 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> + 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> + 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> + 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> + 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> + 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> + 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS + 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> + 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> + 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS + 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3> + 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> + 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS + 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> + 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> + 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> + 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> + 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> + 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> + 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS + 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS + 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> + 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> + 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> + 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS + 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS + 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS + 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS + 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS + 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> + 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> + 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> + 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS + 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS + 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS + 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> + 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> + 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> + 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> + 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> + 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> + 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS + 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> + 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> + 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> + 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> + 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> + 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> + 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> + 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS + 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS + 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> + 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS + 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> + 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> + 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> + 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> + 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> + 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> + 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> + 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> + 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> + 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> + 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS + 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> + 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> + 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> + 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS + 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> + 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> + 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS + 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS + 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> + 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS + 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS + 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS + 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> + 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS + 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> + 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS + 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> + 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> + 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS + 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> + 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> + 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS + 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> + 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> + 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> + 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> + 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> + 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> + 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> + 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS + 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS + 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> + 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> + 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> + 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS + 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> + 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> + 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS + 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS + 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> + 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> + 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> + 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> + 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> + 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> + 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> + 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> + 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> + 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> + 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> + 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> + 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> + 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS + 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> + 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> + 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS + 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> + 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> + 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> + 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> + 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> + 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> + 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> + 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS + 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> + 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> + 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> + 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS + 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> + 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> + 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> + 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> + 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> + 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS + 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> + 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> + 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> + 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS + 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> + 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> + 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> + 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS + 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> + 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> + 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS + 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS + 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS + 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> + 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> + 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> + 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS + 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> + 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> + 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS + 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS + 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> + 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> + 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> + 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> + 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> + 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> + 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS + 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS + 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> + 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> + 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> + 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> + 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> + 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> + 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> + 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> + 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> + 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS + 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> + 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> + 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> + 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS + 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> + 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> + 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> + 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> + 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> + 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> + 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS + 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> + 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> + 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> + 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> + 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> + 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> + 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> + 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> + 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> + 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> + 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> + 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> + 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> + 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> + 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS + 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> + 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> + 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> + 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> + 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS + 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS + 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> + 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS + 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> + 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> + 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> + 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> + 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> + 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> + 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS + 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> + 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> + 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> + 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> + 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> + 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> + 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> + 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> + 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> + 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS + 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> + 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> + 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> + 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> + 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> + 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> + 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> + 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> + 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> + 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> + 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> + 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> + 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> + 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS + 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> + 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> + 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS + 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> + 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> + 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> + 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> + 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> + 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> + 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> + 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS + 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> + 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> + 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> + 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> + 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> + 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> + 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> + 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS + 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> + 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> + 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> + 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> + 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> + 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> + 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> + 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS + 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7> + 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> + 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> + 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS + 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS + 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2> + 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS + 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6> + 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> + 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS + 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> + 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS + 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS + 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS + 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7> + 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS + 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS + 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> + 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS + 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6> + 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS + 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> + 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> + 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> + 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> + 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS + 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> + 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS + 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS + 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6> + 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS + 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS + 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> + 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7> + 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7> + 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS + 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS + 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS + 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6> + 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> + 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7> + 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS + 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> + 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> + 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> + 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> + 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS + 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> + 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> + 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> + 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS + 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6> + 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> + 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> + 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS + 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS + 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> + 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> + 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> + 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> + 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> + 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> + 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> + 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS + 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> + 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS + 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> + 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> + 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> + 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> + 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> + 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> + 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> + 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS + 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> + 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> + 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> + 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> + 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> + 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS + 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> + 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> + 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS + 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> + 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS + 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> + 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> + 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> + 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> + 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS + 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS + 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> + 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> + 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> + 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> + 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> + 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> + 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> + 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> + 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> + 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> + 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> + 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> + 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> + 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> + 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> + 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> + 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> + 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> + 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS + 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0> + 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> + 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> + 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> + 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> + 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> + 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> + 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> + 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> + 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> + 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> + 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS + 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> + 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> + 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> + 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> + 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> + 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> + 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> + 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS + 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> + 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> + 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> + 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> + 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> + 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> + 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> + 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS + 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> + 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> + 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> + 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> + 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> + 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> + 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> + 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> + 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> + 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> + 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> + 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> + 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> + 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> + 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> + 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> + 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> + 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> + 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> + 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS + 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> + 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> + 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> + 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> + 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3> + 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7> + 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS + 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> + 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> + 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> + 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> + 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> + 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> + 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> + 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS + 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> + 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> + 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS + 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> + 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> + 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> + 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> + 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> + 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> + 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> + 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> + 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> + 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> + 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> + 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2> + 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> + 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> + 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> + 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS + 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> + 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> + 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> + 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> + 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> + 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> + 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> + 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> + 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> + 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> + 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> + 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> + 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> + 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS + 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> + 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> + 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> + 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> + 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> + 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> + 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS + 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> + 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> + 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> + 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS + 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS + 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> + 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> + 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> + 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS + 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> + 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> + 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> + 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> + 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> + 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3> + 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7> + 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> + 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> + 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> + 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS + 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> + 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> + 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS + 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS + 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> + 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> + 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> + 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS + 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> + 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> + 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS + 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS + 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> + 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> + 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> + 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> + 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> + 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5> + 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> + 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> + 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> + 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> + 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> + 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> + 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> + 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> + 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> + 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS + 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> + 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> + 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS + 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS + 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> + 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS + 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS + 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> + 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> + 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> + 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> + 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> + 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> + 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> + 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> + 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> + 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> + 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS + 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> + 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS + 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> + 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> + 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> + 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> + 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> + 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> + 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> + 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS + 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS + 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> + 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> + 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> + 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> + 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> + 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> + 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> + 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> + 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> + 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> + 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> + 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> + 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> + 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS + 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> + 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> + 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS + 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS + 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> + 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1> + 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> + 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS + 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS + 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS + 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> + 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> + 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> + 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS + 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> + 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> + 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS + 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> + 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> + 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> + 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> + 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS + 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> + 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> + 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> + 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4> + 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS + 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS + 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> + 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> + 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6> + 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> + 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> + 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> + 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> + 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> + 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> + 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> + 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS + 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> + 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> + 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> + 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> + 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> + 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> + 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> + 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS + 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> + 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> + 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> + 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> + 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> + 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> + 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> + 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> + 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS + 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> + 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> + 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> + 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> + 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> + 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> + 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> + 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> + 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS + 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS + 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> + 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> + 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> + 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> + 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS + 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> + 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS + 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS + 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> + 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> + 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3> + 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS + 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> + 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0> + 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> + 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS + 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> + 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> + 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3> + 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> + 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> + 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> + 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6> + 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> + 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> + 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS + 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> + 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> + 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> + 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS + 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> + 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> + 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> + 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS + 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2> + 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3> + 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1> + 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5> + 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7> + 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS + 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS + 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> + 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS + 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> + 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> + 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> + 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> + 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> + 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS + 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS + 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2> + 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> + 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> + 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> + 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> + 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> + 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> + 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS + 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> + 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> + 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> + 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> + 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS + 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> + 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> + 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS + 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS + 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> + 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> + 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> + 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS + 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5> + 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> + 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> + 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> + 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> + 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> + 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> + 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> + 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS + 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS + 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS + 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS + 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> + 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> + 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> + 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> + 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> + 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> + 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> + 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS + 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS + 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> + 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> + 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> + 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> + 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS + 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> + 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> + 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> + 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> + 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> + 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> + 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> + 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> + 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> + 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> + 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> + 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS + 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> + 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> + 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS + 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> + 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1> + 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> + 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7> + 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> + 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> + 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS + 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> + 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> + 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> + 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> + 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> + 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> + 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS + 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS + 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> + 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> + 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> + 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS + 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> + 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> + 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> + 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> + 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS + 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> + 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> + 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> + 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS + 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> + 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> + 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> + 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS + 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS + 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> + 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> + 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> + 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS + 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> + 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> + 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> + 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS + 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> + 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> + 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> + 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> + 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> + 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS + 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> + 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> + 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS + 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS + 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> + 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> + 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> + 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS + 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> + 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> + 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS + 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS + 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> + 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> + 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> + 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> + 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS + 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> + 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> + 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0> + 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> + 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> + 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> + 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> + 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> + 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS + 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> + 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> + 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> + 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> + 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS + 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS + 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> + 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> + 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS + 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> + 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> + 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> + 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS + 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> + 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS + 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> + 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> + 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> + 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1> + 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2> + 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> + 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS + 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS + 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS + 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> + 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS + 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS + 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS + 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 115726126U, // <1,u,3,2>: Cost 1 vrev LHS + 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS + 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS + 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> + 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS + 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS + 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> + 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> + 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4> + 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> + 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS + 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS + 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> + 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6> + 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS + 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS + 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> + 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS + 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS + 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS + 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> + 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0> + 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> + 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7> + 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> + 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7> + 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> + 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> + 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> + 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> + 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> + 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS + 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> + 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7> + 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> + 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> + 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS + 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS + 115767091U, // <1,u,u,2>: Cost 1 vrev LHS + 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS + 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS + 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS + 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS + 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS + 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> + 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> + 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> + 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> + 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS + 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> + 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0> + 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> + 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> + 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> + 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> + 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS + 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS + 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> + 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> + 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> + 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS + 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS + 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> + 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> + 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> + 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS + 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> + 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> + 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> + 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS + 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> + 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> + 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> + 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS + 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> + 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> + 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> + 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> + 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS + 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> + 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> + 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> + 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS + 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> + 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> + 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS + 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> + 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> + 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> + 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> + 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> + 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> + 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> + 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> + 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS + 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> + 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> + 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> + 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> + 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> + 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> + 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> + 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> + 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> + 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> + 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> + 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> + 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> + 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> + 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS + 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> + 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> + 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS + 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS + 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS + 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS + 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> + 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> + 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS + 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> + 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> + 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> + 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> + 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS + 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> + 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> + 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> + 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS + 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> + 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> + 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> + 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> + 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS + 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> + 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> + 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> + 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS + 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> + 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> + 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> + 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> + 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS + 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> + 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> + 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS + 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> + 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> + 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> + 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> + 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> + 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> + 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS + 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS + 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS + 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> + 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4> + 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS + 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> + 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> + 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> + 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS + 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5> + 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> + 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS + 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> + 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS + 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> + 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> + 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS + 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS + 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> + 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> + 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> + 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS + 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> + 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> + 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> + 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> + 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6> + 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> + 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> + 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> + 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2> + 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS + 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> + 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> + 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS + 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1> + 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> + 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> + 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS + 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> + 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> + 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS + 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> + 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> + 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> + 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> + 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> + 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> + 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0> + 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS + 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS + 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> + 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> + 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> + 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS + 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS + 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> + 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS + 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> + 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS + 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7> + 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> + 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS + 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> + 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> + 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> + 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS + 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> + 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> + 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> + 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS + 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS + 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2> + 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> + 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> + 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS + 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS + 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> + 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS + 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> + 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> + 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> + 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS + 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> + 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> + 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> + 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS + 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> + 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> + 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> + 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> + 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS + 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> + 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> + 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> + 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> + 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> + 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> + 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> + 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS + 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> + 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> + 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> + 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> + 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> + 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS + 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS + 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS + 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS + 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS + 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS + 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS + 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> + 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> + 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS + 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS + 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> + 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0> + 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> + 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> + 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> + 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> + 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> + 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> + 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> + 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> + 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> + 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> + 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS + 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> + 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> + 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS + 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS + 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> + 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS + 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS + 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> + 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> + 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> + 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> + 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> + 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> + 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> + 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3> + 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> + 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> + 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS + 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1> + 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS + 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> + 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1> + 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS + 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> + 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4> + 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> + 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> + 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> + 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> + 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> + 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> + 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> + 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS + 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> + 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> + 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> + 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> + 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS + 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> + 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> + 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> + 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> + 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> + 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS + 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS + 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> + 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS + 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> + 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> + 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> + 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> + 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> + 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> + 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> + 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> + 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> + 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> + 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> + 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS + 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS + 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> + 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS + 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> + 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> + 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> + 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> + 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS + 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> + 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS + 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> + 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> + 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> + 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS + 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> + 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> + 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS + 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> + 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> + 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> + 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> + 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> + 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> + 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> + 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS + 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> + 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> + 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS + 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> + 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS + 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> + 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> + 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> + 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> + 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> + 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> + 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS + 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> + 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> + 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> + 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> + 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> + 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> + 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> + 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> + 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> + 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> + 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> + 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> + 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> + 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> + 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> + 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> + 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS + 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS + 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> + 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> + 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> + 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> + 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> + 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> + 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS + 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4> + 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> + 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> + 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS + 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS + 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> + 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS + 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS + 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> + 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> + 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> + 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS + 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> + 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> + 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> + 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> + 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS + 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> + 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> + 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> + 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> + 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> + 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> + 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS + 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS + 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS + 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> + 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> + 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> + 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS + 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> + 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> + 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS + 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS + 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS + 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> + 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> + 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6> + 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS + 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS + 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> + 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> + 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> + 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> + 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> + 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> + 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> + 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS + 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> + 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> + 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> + 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> + 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> + 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7> + 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> + 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS + 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> + 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> + 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> + 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> + 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6> + 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> + 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> + 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS + 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> + 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> + 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> + 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> + 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> + 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> + 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> + 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS + 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS + 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> + 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> + 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> + 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> + 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> + 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6> + 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS + 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS + 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> + 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3> + 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> + 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> + 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> + 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5> + 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0> + 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS + 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS + 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS + 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1> + 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> + 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3> + 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS + 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> + 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> + 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS + 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS + 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> + 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> + 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> + 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> + 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> + 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> + 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> + 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0> + 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1> + 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2> + 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3> + 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6> + 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7> + 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS + 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS + 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2> + 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> + 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> + 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> + 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2> + 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7> + 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> + 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2> + 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> + 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1> + 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0> + 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7> + 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5> + 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7> + 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7> + 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0> + 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2> + 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7> + 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2> + 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1> + 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6> + 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> + 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> + 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1> + 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7> + 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS + 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> + 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> + 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3> + 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS + 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> + 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> + 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2> + 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS + 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6> + 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4> + 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7> + 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> + 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4> + 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS + 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u> + 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0> + 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS + 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> + 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7> + 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7> + 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5> + 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS + 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5> + 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7> + 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6> + 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS + 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS + 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7> + 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3> + 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6> + 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS + 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> + 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> + 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0> + 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS + 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> + 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2> + 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7> + 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7> + 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS + 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5> + 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> + 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7> + 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1> + 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS + 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> + 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2> + 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u> + 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS + 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS + 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> + 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2> + 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS + 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS + 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2> + 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2> + 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS + 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3> + 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> + 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS + 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS + 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS + 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS + 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> + 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> + 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS + 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> + 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> + 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS + 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS + 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS + 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> + 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> + 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4> + 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS + 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS + 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS + 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> + 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> + 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7> + 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS + 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> + 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7> + 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS + 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS + 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> + 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> + 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> + 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS + 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS + 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS + 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS + 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS + 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS + 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS + 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS + 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> + 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> + 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> + 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1> + 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> + 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> + 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS + 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> + 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS + 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> + 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS + 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> + 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> + 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> + 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> + 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> + 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> + 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> + 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> + 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> + 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> + 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> + 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> + 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> + 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> + 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> + 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> + 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> + 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> + 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6> + 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> + 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> + 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3> + 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS + 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> + 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6> + 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5> + 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> + 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7> + 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS + 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> + 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> + 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> + 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> + 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1> + 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> + 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6> + 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> + 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> + 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2> + 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> + 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> + 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7> + 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6> + 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> + 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> + 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7> + 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> + 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> + 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS + 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7> + 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> + 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS + 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS + 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS + 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> + 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> + 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> + 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> + 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> + 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> + 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS + 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> + 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> + 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS + 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> + 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> + 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> + 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS + 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS + 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> + 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> + 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> + 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> + 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> + 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> + 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> + 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS + 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> + 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> + 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS + 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> + 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> + 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> + 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> + 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> + 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> + 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> + 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> + 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> + 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> + 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5> + 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7> + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1> + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2> + 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> + 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> + 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS + 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6> + 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> + 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7> + 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> + 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1> + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> + 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> + 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> + 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> + 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> + 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> + 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> + 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> + 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> + 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> + 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> + 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS + 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> + 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> + 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> + 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> + 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> + 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> + 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> + 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> + 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> + 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> + 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS + 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> + 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0> + 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> + 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> + 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> + 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> + 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> + 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> + 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> + 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS + 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> + 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> + 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> + 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS + 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS + 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> + 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3> + 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS + 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> + 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> + 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS + 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> + 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS + 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> + 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> + 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> + 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> + 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> + 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> + 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS + 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> + 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> + 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> + 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> + 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS + 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> + 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> + 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> + 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS + 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> + 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> + 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> + 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> + 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> + 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> + 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> + 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> + 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> + 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> + 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> + 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> + 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> + 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS + 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS + 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> + 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> + 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS + 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> + 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> + 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS + 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> + 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> + 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS + 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS + 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> + 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> + 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS + 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> + 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS + 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> + 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS + 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> + 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> + 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3> + 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> + 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> + 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> + 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> + 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2> + 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> + 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> + 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> + 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6> + 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> + 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> + 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7> + 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> + 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS + 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS + 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> + 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1> + 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS + 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS + 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS + 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> + 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS + 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> + 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS + 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> + 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> + 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> + 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> + 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> + 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS + 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS + 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> + 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> + 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> + 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> + 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> + 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> + 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> + 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> + 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> + 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> + 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> + 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> + 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> + 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> + 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> + 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> + 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> + 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> + 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> + 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> + 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> + 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> + 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> + 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> + 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS + 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS + 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS + 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> + 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> + 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> + 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS + 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS + 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> + 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> + 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS + 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> + 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> + 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> + 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS + 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> + 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> + 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> + 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> + 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> + 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> + 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> + 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> + 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> + 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> + 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> + 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> + 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS + 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> + 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> + 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> + 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS + 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> + 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> + 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS + 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS + 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> + 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> + 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> + 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS + 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> + 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3> + 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS + 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS + 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> + 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> + 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> + 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> + 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> + 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> + 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> + 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> + 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> + 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> + 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> + 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> + 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> + 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> + 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> + 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> + 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS + 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> + 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> + 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> + 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> + 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> + 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> + 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> + 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> + 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> + 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> + 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> + 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> + 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> + 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> + 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> + 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS + 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> + 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> + 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> + 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> + 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> + 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> + 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> + 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS + 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> + 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS + 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> + 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> + 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> + 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> + 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> + 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> + 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS + 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> + 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> + 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> + 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6> + 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> + 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> + 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> + 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> + 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> + 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> + 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> + 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> + 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> + 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> + 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> + 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS + 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> + 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> + 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> + 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> + 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> + 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> + 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> + 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> + 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> + 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> + 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> + 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> + 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> + 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> + 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> + 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> + 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> + 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> + 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> + 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> + 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS + 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> + 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> + 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> + 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> + 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS + 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> + 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> + 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> + 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS + 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> + 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> + 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> + 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> + 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> + 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> + 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> + 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> + 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> + 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> + 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> + 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> + 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> + 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS + 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> + 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> + 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> + 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS + 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS + 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> + 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> + 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS + 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> + 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> + 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> + 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> + 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> + 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS + 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> + 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> + 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS + 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> + 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> + 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> + 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS + 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2> + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0> + 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS + 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6> + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7> + 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2> + 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2> + 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1> + 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1> + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2> + 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2> + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3> + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> + 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3> + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0> + 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7> + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3> + 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0> + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1> + 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> + 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2> + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5> + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7> + 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS + 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5> + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5> + 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6> + 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6> + 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6> + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6> + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7> + 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7> + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1> + 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> + 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7> + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7> + 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0> + 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7> + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> + 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2> + 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1> + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2> + 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS + 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5> + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6> + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0> + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS + 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> + 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> + 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> + 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> + 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> + 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> + 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> + 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> + 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> + 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> + 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> + 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> + 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> + 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> + 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> + 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS + 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> + 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> + 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS + 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> + 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> + 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> + 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS + 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> + 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> + 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> + 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS + 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS + 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> + 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> + 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS + 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS + 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS + 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> + 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> + 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS + 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> + 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> + 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> + 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS + 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS + 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> + 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS + 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> + 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS + 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> + 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> + 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> + 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS + 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> + 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS + 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS + 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> + 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> + 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> + 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> + 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> + 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS + 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS + 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS + 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> + 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS + 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS + 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> + 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> + 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS + 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> + 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> + 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> + 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> + 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> + 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS + 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> + 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> + 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> + 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> + 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> + 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> + 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> + 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS + 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> + 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> + 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> + 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS + 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> + 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> + 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> + 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> + 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> + 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> + 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> + 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> + 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS + 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> + 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS + 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> + 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> + 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS + 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> + 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> + 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> + 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> + 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3> + 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS + 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> + 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> + 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS + 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS + 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> + 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> + 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> + 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS + 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> + 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> + 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> + 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> + 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> + 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> + 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> + 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS + 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> + 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> + 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS + 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS + 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> + 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS + 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> + 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> + 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS + 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> + 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> + 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> + 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> + 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> + 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> + 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> + 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> + 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> + 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> + 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> + 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> + 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> + 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> + 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> + 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> + 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> + 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> + 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> + 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> + 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> + 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> + 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> + 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> + 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> + 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS + 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> + 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> + 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS + 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS + 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> + 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> + 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS + 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS + 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> + 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> + 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS + 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS + 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS + 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> + 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> + 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> + 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3> + 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2> + 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> + 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> + 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> + 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> + 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> + 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> + 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> + 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS + 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> + 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> + 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS + 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> + 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2> + 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS + 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> + 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> + 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> + 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> + 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> + 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> + 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> + 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> + 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> + 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> + 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> + 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> + 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> + 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> + 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS + 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> + 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> + 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> + 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> + 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> + 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> + 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> + 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> + 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> + 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> + 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> + 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> + 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> + 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> + 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> + 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> + 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> + 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> + 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS + 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> + 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> + 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> + 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS + 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> + 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> + 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> + 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS + 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS + 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> + 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> + 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> + 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> + 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> + 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> + 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> + 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> + 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> + 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> + 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> + 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> + 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> + 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> + 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> + 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS + 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> + 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> + 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> + 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> + 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> + 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> + 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> + 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> + 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> + 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> + 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> + 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> + 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS + 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> + 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> + 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> + 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> + 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> + 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> + 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> + 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> + 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> + 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> + 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> + 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> + 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> + 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> + 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> + 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> + 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> + 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> + 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> + 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> + 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> + 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> + 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> + 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> + 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> + 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS + 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS + 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS + 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> + 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> + 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> + 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS + 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> + 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS + 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> + 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> + 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> + 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS + 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> + 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS + 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> + 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS + 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> + 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> + 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> + 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> + 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> + 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> + 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> + 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS + 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS + 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> + 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS + 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> + 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> + 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> + 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> + 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> + 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> + 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> + 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS + 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> + 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> + 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> + 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS + 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> + 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> + 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> + 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> + 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS + 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS + 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> + 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> + 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> + 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> + 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> + 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> + 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS + 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> + 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> + 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> + 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> + 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> + 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> + 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> + 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> + 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> + 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> + 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS + 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> + 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> + 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> + 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS + 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> + 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> + 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> + 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> + 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> + 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> + 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> + 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> + 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS + 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS + 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS + 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> + 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> + 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> + 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS + 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> + 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS + 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> + 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> + 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> + 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS + 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> + 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> + 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> + 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS + 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS + 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0> + 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> + 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS + 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7> + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> + 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> + 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> + 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> + 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> + 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS + 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> + 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> + 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> + 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> + 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> + 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> + 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS + 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> + 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> + 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> + 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> + 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> + 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> + 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> + 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> + 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> + 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> + 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> + 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> + 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> + 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> + 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> + 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS + 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> + 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> + 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> + 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS + 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS + 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> + 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS + 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS + 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> + 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> + 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> + 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> + 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> + 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> + 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS + 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS + 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS + 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> + 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> + 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> + 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> + 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> + 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> + 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS + 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS + 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> + 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> + 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> + 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> + 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> + 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> + 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> + 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS + 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1> + 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS + 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS + 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS + 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> + 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> + 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> + 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> + 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> + 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> + 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> + 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> + 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS + 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> + 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> + 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> + 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> + 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS + 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> + 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> + 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> + 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> + 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> + 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> + 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> + 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> + 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> + 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> + 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> + 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> + 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> + 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> + 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> + 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> + 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> + 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> + 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS + 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> + 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> + 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS + 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> + 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> + 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> + 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> + 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> + 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> + 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> + 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> + 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> + 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> + 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> + 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> + 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS + 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS + 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> + 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> + 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> + 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> + 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> + 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> + 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> + 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> + 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS + 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> + 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> + 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS + 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> + 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> + 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS + 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> + 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> + 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2> + 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> + 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0> + 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> + 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> + 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS + 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> + 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> + 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3> + 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> + 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> + 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3> + 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS + 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> + 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> + 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> + 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> + 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4> + 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> + 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3> + 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> + 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> + 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> + 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> + 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> + 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> + 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7> + 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> + 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> + 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS + 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2> + 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> + 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS + 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6> + 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS + 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS + 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> + 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS + 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS + 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> + 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS + 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> + 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS + 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> + 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS + 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> + 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> + 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> + 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS + 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> + 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS + 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS + 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> + 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> + 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> + 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> + 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS + 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> + 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> + 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> + 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> + 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> + 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> + 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> + 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> + 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> + 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> + 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> + 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> + 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> + 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> + 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS + 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS + 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> + 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> + 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> + 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> + 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS + 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS + 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> + 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> + 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> + 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> + 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> + 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> + 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> + 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS + 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS + 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> + 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> + 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> + 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS + 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> + 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> + 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> + 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS + 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> + 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> + 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> + 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> + 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> + 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> + 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> + 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> + 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> + 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> + 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> + 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> + 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> + 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> + 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> + 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> + 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> + 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0> + 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> + 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> + 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> + 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> + 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> + 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> + 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> + 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> + 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> + 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS + 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> + 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> + 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> + 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS + 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> + 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> + 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS + 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> + 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> + 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS + 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS + 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> + 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> + 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> + 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS + 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1> + 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> + 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS + 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7> + 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1> + 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS + 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> + 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> + 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> + 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> + 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> + 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> + 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS + 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> + 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> + 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> + 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> + 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS + 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> + 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> + 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> + 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> + 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS + 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> + 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> + 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> + 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> + 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> + 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> + 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> + 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> + 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> + 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> + 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> + 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS + 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS + 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> + 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> + 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS + 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS + 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> + 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> + 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> + 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS + 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS + 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> + 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> + 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> + 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> + 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> + 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> + 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS + 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> + 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> + 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS + 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS + 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> + 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> + 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> + 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS + 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> + 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> + 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> + 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> + 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> + 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> + 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> + 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> + 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> + 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> + 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> + 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> + 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> + 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> + 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> + 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> + 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> + 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> + 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> + 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> + 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> + 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> + 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> + 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> + 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> + 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> + 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> + 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> + 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> + 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> + 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> + 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> + 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> + 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3> + 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> + 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> + 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3> + 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS + 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS + 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> + 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> + 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> + 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS + 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> + 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> + 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> + 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> + 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> + 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> + 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> + 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> + 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> + 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> + 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> + 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> + 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> + 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> + 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> + 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> + 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> + 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5> + 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS + 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> + 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> + 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> + 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> + 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> + 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> + 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> + 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> + 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> + 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> + 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> + 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> + 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS + 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> + 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> + 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> + 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> + 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> + 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS + 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS + 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> + 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> + 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> + 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS + 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> + 94817590U, // <5,4,7,6>: Cost 1 vrev RHS + 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS + 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> + 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> + 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4> + 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS + 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> + 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> + 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> + 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> + 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> + 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> + 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> + 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS + 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> + 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> + 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> + 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> + 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> + 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> + 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> + 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> + 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> + 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> + 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> + 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> + 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> + 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> + 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> + 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> + 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> + 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS + 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> + 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> + 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> + 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> + 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> + 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> + 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> + 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> + 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS + 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> + 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> + 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> + 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS + 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> + 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> + 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS + 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> + 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> + 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7> + 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS + 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS + 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> + 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> + 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> + 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> + 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> + 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> + 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> + 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> + 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3> + 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> + 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> + 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> + 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> + 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> + 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS + 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> + 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> + 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> + 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> + 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> + 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> + 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> + 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS + 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS + 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS + 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> + 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> + 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> + 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS + 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> + 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> + 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> + 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> + 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> + 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> + 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> + 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> + 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> + 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> + 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> + 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS + 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> + 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> + 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> + 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> + 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> + 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> + 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> + 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> + 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> + 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> + 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> + 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> + 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> + 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> + 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> + 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> + 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> + 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS + 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> + 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> + 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> + 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS + 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> + 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> + 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS + 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS + 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> + 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> + 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> + 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> + 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> + 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> + 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> + 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> + 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> + 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> + 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> + 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> + 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> + 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS + 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0> + 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS + 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7> + 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS + 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> + 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2> + 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1> + 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1> + 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0> + 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> + 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> + 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> + 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0> + 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> + 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> + 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> + 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3> + 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> + 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> + 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> + 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3> + 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> + 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1> + 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> + 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> + 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> + 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5> + 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> + 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5> + 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6> + 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5> + 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> + 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3> + 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7> + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS + 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS + 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS + 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> + 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7> + 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS + 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS + 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7> + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS + 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS + 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS + 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> + 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> + 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> + 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> + 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> + 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> + 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> + 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> + 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> + 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> + 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> + 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> + 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> + 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> + 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> + 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> + 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> + 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS + 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> + 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> + 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> + 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> + 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> + 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> + 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> + 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS + 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS + 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> + 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> + 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> + 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> + 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> + 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS + 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS + 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> + 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> + 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> + 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> + 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> + 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS + 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> + 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> + 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS + 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> + 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> + 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS + 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> + 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> + 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> + 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> + 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> + 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> + 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> + 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> + 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> + 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS + 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> + 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> + 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> + 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS + 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> + 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> + 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> + 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> + 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> + 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> + 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> + 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS + 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> + 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> + 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> + 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> + 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> + 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS + 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> + 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS + 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> + 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> + 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS + 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS + 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> + 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> + 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> + 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> + 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> + 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> + 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> + 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> + 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS + 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS + 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> + 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> + 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> + 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> + 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> + 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> + 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> + 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> + 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> + 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS + 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> + 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> + 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> + 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> + 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> + 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> + 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> + 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> + 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> + 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> + 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> + 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> + 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> + 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> + 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> + 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> + 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> + 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> + 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> + 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> + 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> + 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> + 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> + 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> + 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> + 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS + 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> + 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> + 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS + 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS + 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> + 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> + 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS + 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2> + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> + 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS + 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> + 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS + 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> + 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> + 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> + 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> + 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> + 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> + 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> + 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> + 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> + 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> + 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> + 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> + 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> + 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> + 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> + 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> + 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> + 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> + 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> + 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> + 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> + 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> + 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> + 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> + 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> + 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS + 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> + 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> + 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> + 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> + 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> + 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> + 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> + 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS + 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> + 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> + 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> + 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6> + 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> + 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> + 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> + 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> + 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> + 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> + 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> + 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> + 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS + 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> + 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> + 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS + 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS + 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> + 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> + 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> + 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> + 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> + 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> + 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> + 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS + 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> + 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> + 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> + 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> + 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> + 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> + 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> + 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> + 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> + 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> + 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6> + 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS + 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> + 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> + 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> + 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> + 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS + 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> + 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> + 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> + 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> + 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS + 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> + 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> + 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> + 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS + 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> + 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> + 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> + 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> + 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS + 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> + 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> + 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> + 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> + 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> + 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> + 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> + 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> + 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS + 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> + 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> + 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> + 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> + 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> + 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> + 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> + 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS + 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> + 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS + 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> + 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2> + 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> + 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> + 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> + 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> + 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> + 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> + 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS + 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> + 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS + 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS + 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> + 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> + 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> + 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS + 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> + 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> + 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> + 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> + 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> + 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> + 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> + 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> + 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> + 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> + 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> + 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> + 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> + 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> + 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS + 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS + 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS + 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> + 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> + 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7> + 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS + 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS + 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> + 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> + 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> + 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> + 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> + 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> + 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> + 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> + 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS + 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> + 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> + 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> + 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> + 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> + 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> + 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> + 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> + 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> + 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> + 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> + 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> + 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> + 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> + 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS + 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> + 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS + 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> + 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> + 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> + 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> + 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> + 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> + 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> + 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> + 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> + 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS + 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> + 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> + 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS + 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> + 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> + 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> + 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS + 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> + 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> + 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS + 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS + 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> + 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS + 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> + 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> + 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> + 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> + 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> + 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> + 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> + 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> + 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> + 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> + 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> + 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1> + 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2> + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0> + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3> + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS + 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3> + 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7> + 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5> + 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6> + 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6> + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6> + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7> + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS + 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7> + 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS + 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> + 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS + 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS + 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS + 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> + 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> + 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS + 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS + 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> + 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> + 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS + 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS + 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS + 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS + 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS + 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS + 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS + 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS + 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS + 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> + 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> + 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> + 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> + 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> + 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> + 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> + 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> + 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS + 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> + 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS + 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> + 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS + 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> + 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> + 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> + 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> + 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> + 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> + 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> + 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> + 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> + 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> + 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> + 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> + 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> + 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> + 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> + 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> + 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> + 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> + 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> + 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> + 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> + 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> + 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> + 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS + 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> + 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> + 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> + 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS + 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> + 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> + 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> + 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> + 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> + 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> + 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> + 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> + 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> + 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> + 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> + 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> + 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> + 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> + 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> + 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> + 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS + 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS + 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> + 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> + 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> + 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> + 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> + 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> + 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> + 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> + 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS + 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> + 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> + 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS + 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> + 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1> + 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS + 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS + 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS + 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS + 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> + 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS + 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> + 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> + 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> + 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> + 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> + 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> + 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> + 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> + 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> + 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> + 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> + 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> + 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> + 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> + 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> + 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> + 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> + 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> + 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> + 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> + 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> + 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> + 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> + 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> + 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> + 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> + 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> + 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> + 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> + 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> + 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> + 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> + 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> + 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS + 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> + 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> + 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS + 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS + 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> + 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> + 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> + 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS + 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> + 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> + 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> + 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> + 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> + 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> + 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> + 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> + 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> + 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> + 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> + 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> + 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> + 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> + 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> + 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS + 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS + 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> + 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> + 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> + 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS + 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS + 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> + 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> + 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> + 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS + 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> + 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> + 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS + 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> + 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> + 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> + 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> + 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> + 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> + 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> + 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> + 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> + 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> + 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> + 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0> + 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> + 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> + 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> + 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> + 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> + 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> + 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> + 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> + 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> + 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> + 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> + 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> + 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> + 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> + 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> + 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> + 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> + 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> + 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> + 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> + 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> + 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> + 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> + 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> + 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> + 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> + 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> + 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> + 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> + 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> + 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> + 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> + 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> + 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> + 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> + 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> + 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> + 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> + 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> + 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> + 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> + 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS + 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> + 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> + 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> + 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS + 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> + 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> + 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> + 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> + 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> + 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> + 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> + 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS + 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS + 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> + 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> + 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> + 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS + 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> + 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> + 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> + 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> + 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> + 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> + 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> + 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> + 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> + 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> + 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> + 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> + 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> + 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> + 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> + 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> + 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> + 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> + 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> + 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> + 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> + 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> + 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> + 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> + 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> + 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> + 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> + 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> + 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> + 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> + 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> + 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> + 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> + 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> + 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> + 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> + 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> + 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> + 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> + 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> + 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> + 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> + 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> + 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> + 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> + 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> + 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> + 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> + 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> + 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> + 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> + 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS + 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> + 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> + 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> + 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> + 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> + 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> + 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> + 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> + 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> + 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> + 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> + 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> + 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> + 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> + 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> + 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> + 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> + 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> + 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> + 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> + 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> + 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> + 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> + 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> + 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> + 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> + 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> + 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> + 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3> + 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> + 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> + 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> + 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> + 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> + 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> + 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> + 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> + 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> + 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> + 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> + 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> + 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> + 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> + 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> + 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> + 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> + 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> + 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> + 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> + 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> + 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> + 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> + 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> + 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> + 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> + 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> + 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> + 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> + 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> + 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> + 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> + 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> + 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> + 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> + 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> + 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> + 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> + 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> + 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> + 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> + 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> + 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> + 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS + 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> + 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> + 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> + 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS + 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> + 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS + 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> + 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS + 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> + 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> + 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> + 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> + 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> + 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> + 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> + 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> + 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> + 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> + 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> + 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> + 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> + 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> + 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> + 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS + 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> + 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> + 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> + 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> + 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS + 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> + 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS + 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS + 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS + 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> + 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> + 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> + 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> + 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> + 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> + 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> + 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> + 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> + 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> + 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> + 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> + 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> + 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> + 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> + 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> + 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> + 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> + 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> + 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> + 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS + 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> + 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> + 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> + 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS + 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> + 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> + 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> + 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> + 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS + 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> + 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> + 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> + 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS + 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS + 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> + 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> + 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> + 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> + 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> + 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> + 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> + 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> + 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> + 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> + 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> + 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> + 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> + 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> + 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> + 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> + 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS + 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> + 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> + 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> + 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS + 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> + 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> + 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS + 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> + 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> + 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS + 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> + 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> + 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> + 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> + 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> + 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> + 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> + 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> + 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> + 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> + 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> + 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS + 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> + 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> + 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> + 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> + 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS + 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> + 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> + 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> + 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> + 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> + 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> + 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> + 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> + 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> + 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> + 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> + 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> + 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> + 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> + 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> + 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> + 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> + 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> + 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> + 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> + 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> + 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> + 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> + 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> + 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> + 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> + 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS + 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS + 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> + 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> + 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> + 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> + 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> + 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> + 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> + 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> + 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> + 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> + 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> + 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> + 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> + 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> + 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> + 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> + 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> + 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> + 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> + 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> + 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> + 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> + 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> + 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> + 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> + 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> + 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> + 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> + 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS + 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> + 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> + 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> + 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> + 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> + 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> + 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> + 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> + 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> + 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> + 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> + 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> + 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> + 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> + 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> + 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS + 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> + 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> + 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> + 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> + 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> + 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> + 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> + 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> + 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> + 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> + 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> + 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3> + 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> + 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> + 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> + 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> + 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> + 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> + 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> + 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> + 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> + 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> + 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> + 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> + 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> + 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> + 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> + 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> + 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> + 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> + 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> + 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS + 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> + 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> + 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> + 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> + 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> + 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> + 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> + 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> + 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS + 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> + 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> + 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> + 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS + 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> + 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> + 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> + 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> + 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS + 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> + 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> + 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> + 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS + 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> + 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> + 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS + 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS + 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> + 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> + 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> + 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS + 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> + 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7> + 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS + 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2> + 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2> + 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2> + 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1> + 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1> + 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS + 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2> + 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS + 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS + 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3> + 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS + 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3> + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0> + 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0> + 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4> + 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3> + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0> + 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> + 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6> + 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> + 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7> + 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0> + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> + 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5> + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> + 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5> + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> + 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6> + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3> + 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7> + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1> + 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7> + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7> + 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7> + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0> + 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7> + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> + 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3> + 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6> + 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5> + 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> + 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2> + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> + 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS + 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0> + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS + 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS + 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3> + 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0> + 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS + 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS + 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS + 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS + 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3> + 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS + 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7> + 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2> + 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS + 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS + 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7> + 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 72589981U, // <u,0,3,2>: Cost 1 vrev LHS + 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3> + 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6> + 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0> + 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0> + 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3> + 73032403U, // <u,0,3,u>: Cost 1 vrev LHS + 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u> + 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2> + 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS + 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS + 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6> + 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS + 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS + 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0> + 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5> + 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5> + 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0> + 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS + 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS + 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0> + 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS + 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u> + 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6> + 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1> + 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS + 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS + 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0> + 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7> + 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7> + 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS + 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u> + 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7> + 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7> + 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS + 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS + 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS + 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS + 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1> + 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS + 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1> + 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS + 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0> + 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0> + 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS + 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS + 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2> + 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS + 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1> + 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS + 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS + 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1> + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS + 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7> + 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2> + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS + 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1> + 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u> + 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u> + 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4> + 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS + 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS + 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1> + 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4> + 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4> + 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS + 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2> + 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2> + 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS + 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1> + 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1> + 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS + 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS + 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS + 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1> + 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1> + 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1> + 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS + 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7> + 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1> + 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS + 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS + 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7> + 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u> + 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7> + 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS + 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS + 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS + 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS + 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u> + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2> + 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS + 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2> + 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5> + 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7> + 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u> + 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS + 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1> + 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0> + 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS + 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7> + 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1> + 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS + 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2> + 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS + 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS + 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7> + 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS + 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS + 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS + 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS + 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS + 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4> + 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5> + 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS + 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2> + 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS + 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS + 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0> + 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> + 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5> + 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS + 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5> + 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS + 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2> + 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2> + 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS + 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1> + 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2> + 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2> + 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7> + 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS + 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS + 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6> + 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7> + 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS + 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS + 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS + 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS + 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS + 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS + 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS + 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS + 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0> + 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0> + 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0> + 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS + 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u> + 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS + 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> + 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS + 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u> + 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3> + 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3> + 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS + 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3> + 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS + 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2> + 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS + 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4> + 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS + 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS + 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5> + 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6> + 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS + 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS + 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6> + 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u> + 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> + 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6> + 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2> + 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS + 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u> + 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u> + 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS + 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS + 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS + 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> + 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS + 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS + 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0> + 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS + 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4> + 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5> + 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS + 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS + 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1> + 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3> + 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS + 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS + 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4> + 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS + 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u> + 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2> + 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5> + 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS + 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS + 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2> + 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4> + 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3> + 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3> + 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6> + 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> + 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS + 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4> + 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4> + 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS + 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS + 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS + 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2> + 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS + 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS + 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS + 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS + 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS + 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2> + 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2> + 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2> + 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS + 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS + 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS + 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS + 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4> + 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7> + 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7> + 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS + 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 96808489U, // <u,4,7,6>: Cost 1 vrev RHS + 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7> + 96955963U, // <u,4,7,u>: Cost 1 vrev RHS + 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS + 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2> + 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS + 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS + 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS + 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS + 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0> + 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS + 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5> + 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5> + 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5> + 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0> + 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS + 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS + 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5> + 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0> + 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7> + 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS + 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5> + 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3> + 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS + 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7> + 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5> + 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5> + 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS + 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7> + 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS + 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5> + 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS + 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3> + 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u> + 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3> + 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6> + 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5> + 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS + 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS + 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS + 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5> + 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4> + 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5> + 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS + 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5> + 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS + 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS + 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7> + 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2> + 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2> + 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS + 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS + 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS + 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS + 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6> + 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6> + 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6> + 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS + 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2> + 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2> + 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS + 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS + 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS + 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u> + 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2> + 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u> + 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS + 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS + 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0> + 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6> + 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0> + 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u> + 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0> + 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS + 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS + 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> + 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1> + 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0> + 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3> + 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6> + 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7> + 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS + 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1> + 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS + 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2> + 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2> + 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1> + 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS + 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6> + 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7> + 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2> + 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3> + 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3> + 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3> + 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6> + 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS + 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS + 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS + 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3> + 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4> + 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u> + 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS + 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> + 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS + 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS + 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS + 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3> + 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7> + 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5> + 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0> + 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS + 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5> + 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS + 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2> + 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3> + 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2> + 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS + 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3> + 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS + 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS + 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS + 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS + 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS + 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS + 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS + 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS + 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS + 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS + 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS + 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS + 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS + 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0> + 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7> + 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2> + 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS + 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7> + 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS + 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7> + 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u> + 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7> + 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7> + 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> + 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7> + 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7> + 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u> + 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7> + 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u> + 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u> + 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3> + 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7> + 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7> + 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4> + 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS + 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0> + 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS + 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS + 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS + 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> + 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7> + 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2> + 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7> + 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7> + 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7> + 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS + 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS + 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS + 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS + 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS + 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS + 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS + 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0> + 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS + 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS + 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS + 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS + 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS + 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS + 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2> + 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS + 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 835584U, // <u,u,2,u>: Cost 0 copy LHS + 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS + 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 120371557U, // <u,u,3,2>: Cost 1 vrev LHS + 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS + 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS + 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS + 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS + 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4> + 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS + 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS + 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS + 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS + 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS + 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS + 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS + 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS + 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS + 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS + 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2> + 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS + 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6> + 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS + 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS + 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS + 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 120699277U, // <u,u,7,6>: Cost 1 vrev RHS + 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS + 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS + 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS + 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS + 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS + 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS + 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS + 27705344U, // <u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS 0 }; diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp index d5bc3f6..ad51bc1 100644 --- a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp @@ -28,7 +28,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/BitVector.h" diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td index 305b232..22d15b5 100644 --- a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -201,6 +201,10 @@ def CPSR : ARMReg<0, "cpsr">; def FPSCR : ARMReg<1, "fpscr">; def ITSTATE : ARMReg<2, "itstate">; +// Special Registers - only available in privileged mode. +def FPSID : ARMReg<0, "fpsid">; +def FPEXC : ARMReg<8, "fpexc">; + // Register classes. // // pc == Program Counter @@ -256,7 +260,7 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, // restricted GPR register class. Many Thumb2 instructions allow the full // register range for operands, but have undefined behaviours when PC -// or SP (R13 or R15) are used. The ARM ARM refers to these operands +// or SP (R13 or R15) are used. The ARM ISA refers to these operands // via the BadReg() pseudo-code description. def rGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR]> { @@ -381,27 +385,29 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, iterator allocation_order_end(const MachineFunction &MF) const; }]; let MethodBodies = [{ - // VFP2 + // VFP2 / VFPv3-D16 static const unsigned ARM_DPR_VFP2[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, ARM::D4, ARM::D5, ARM::D6, ARM::D7, ARM::D8, ARM::D9, ARM::D10, ARM::D11, ARM::D12, ARM::D13, ARM::D14, ARM::D15 }; - // VFP3 + // VFP3: D8-D15 are callee saved and should be allocated last. + // Save other low registers for use as DPR_VFP2 and DPR_8 classes. static const unsigned ARM_DPR_VFP3[] = { - ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7, - ARM::D8, ARM::D9, ARM::D10, ARM::D11, - ARM::D12, ARM::D13, ARM::D14, ARM::D15, ARM::D16, ARM::D17, ARM::D18, ARM::D19, ARM::D20, ARM::D21, ARM::D22, ARM::D23, ARM::D24, ARM::D25, ARM::D26, ARM::D27, - ARM::D28, ARM::D29, ARM::D30, ARM::D31 }; + ARM::D28, ARM::D29, ARM::D30, ARM::D31, + ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7, + ARM::D8, ARM::D9, ARM::D10, ARM::D11, + ARM::D12, ARM::D13, ARM::D14, ARM::D15 }; + DPRClass::iterator DPRClass::allocation_order_begin(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); - if (Subtarget.hasVFP3()) + if (Subtarget.hasVFP3() && !Subtarget.hasD16()) return ARM_DPR_VFP3; return ARM_DPR_VFP2; } @@ -410,7 +416,7 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, DPRClass::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); - if (Subtarget.hasVFP3()) + if (Subtarget.hasVFP3() && !Subtarget.hasD16()) return ARM_DPR_VFP3 + (sizeof(ARM_DPR_VFP3)/sizeof(unsigned)); else return ARM_DPR_VFP2 + (sizeof(ARM_DPR_VFP2)/sizeof(unsigned)); @@ -438,6 +444,29 @@ def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> { let SubRegClasses = [(DPR dsub_0, dsub_1)]; + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + // Q4-Q7 are callee saved and should be allocated last. + // Save other low registers for use as QPR_VFP2 and QPR_8 classes. + static const unsigned ARM_QPR[] = { + ARM::Q8, ARM::Q9, ARM::Q10, ARM::Q11, + ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15, + ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3, + ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7 }; + + QPRClass::iterator + QPRClass::allocation_order_begin(const MachineFunction &MF) const { + return ARM_QPR; + } + + QPRClass::iterator + QPRClass::allocation_order_end(const MachineFunction &MF) const { + return ARM_QPR + (sizeof(ARM_QPR)/sizeof(unsigned)); + } + }]; } // Subset of QPR that have 32-bit SPR subregs. @@ -463,6 +492,27 @@ def QQPR : RegisterClass<"ARM", [v4i64], [QQ0, QQ1, QQ2, QQ3, QQ4, QQ5, QQ6, QQ7]> { let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3), (QPR qsub_0, qsub_1)]; + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + // QQ2-QQ3 are callee saved and should be allocated last. + // Save other low registers for use as QPR_VFP2 and QPR_8 classes. + static const unsigned ARM_QQPR[] = { + ARM::QQ4, ARM::QQ5, ARM::QQ6, ARM::QQ7, + ARM::QQ0, ARM::QQ1, ARM::QQ2, ARM::QQ3 }; + + QQPRClass::iterator + QQPRClass::allocation_order_begin(const MachineFunction &MF) const { + return ARM_QQPR; + } + + QQPRClass::iterator + QQPRClass::allocation_order_end(const MachineFunction &MF) const { + return ARM_QQPR + (sizeof(ARM_QQPR)/sizeof(unsigned)); + } + }]; } // Subset of QQPR that have 32-bit SPR subregs. @@ -483,6 +533,26 @@ def QQQQPR : RegisterClass<"ARM", [v8i64], let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3, dsub_4, dsub_5, dsub_6, dsub_7), (QPR qsub_0, qsub_1, qsub_2, qsub_3)]; + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + // QQQQ1 is callee saved and should be allocated last. + // Save QQQQ0 for use as QPR_VFP2 and QPR_8 classes. + static const unsigned ARM_QQQQPR[] = { + ARM::QQQQ2, ARM::QQQQ3, ARM::QQQQ0, ARM::QQQQ1 }; + + QQQQPRClass::iterator + QQQQPRClass::allocation_order_begin(const MachineFunction &MF) const { + return ARM_QQQQPR; + } + + QQQQPRClass::iterator + QQQQPRClass::allocation_order_end(const MachineFunction &MF) const { + return ARM_QQQQPR + (sizeof(ARM_QQQQPR)/sizeof(unsigned)); + } + }]; } // Condition code registers. diff --git a/contrib/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm/lib/Target/ARM/ARMSchedule.td index b60ccca..958c5c6 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSchedule.td +++ b/contrib/llvm/lib/Target/ARM/ARMSchedule.td @@ -14,42 +14,86 @@ def IIC_iALUx : InstrItinClass; def IIC_iALUi : InstrItinClass; def IIC_iALUr : InstrItinClass; def IIC_iALUsi : InstrItinClass; +def IIC_iALUsir : InstrItinClass; def IIC_iALUsr : InstrItinClass; +def IIC_iBITi : InstrItinClass; +def IIC_iBITr : InstrItinClass; +def IIC_iBITsi : InstrItinClass; +def IIC_iBITsr : InstrItinClass; def IIC_iUNAr : InstrItinClass; def IIC_iUNAsi : InstrItinClass; -def IIC_iUNAsr : InstrItinClass; +def IIC_iEXTr : InstrItinClass; +def IIC_iEXTAr : InstrItinClass; +def IIC_iEXTAsr : InstrItinClass; def IIC_iCMPi : InstrItinClass; def IIC_iCMPr : InstrItinClass; def IIC_iCMPsi : InstrItinClass; def IIC_iCMPsr : InstrItinClass; +def IIC_iTSTi : InstrItinClass; +def IIC_iTSTr : InstrItinClass; +def IIC_iTSTsi : InstrItinClass; +def IIC_iTSTsr : InstrItinClass; def IIC_iMOVi : InstrItinClass; def IIC_iMOVr : InstrItinClass; def IIC_iMOVsi : InstrItinClass; def IIC_iMOVsr : InstrItinClass; +def IIC_iMOVix2 : InstrItinClass; +def IIC_iMOVix2addpc : InstrItinClass; +def IIC_iMOVix2ld : InstrItinClass; +def IIC_iMVNi : InstrItinClass; +def IIC_iMVNr : InstrItinClass; +def IIC_iMVNsi : InstrItinClass; +def IIC_iMVNsr : InstrItinClass; def IIC_iCMOVi : InstrItinClass; def IIC_iCMOVr : InstrItinClass; def IIC_iCMOVsi : InstrItinClass; def IIC_iCMOVsr : InstrItinClass; +def IIC_iCMOVix2 : InstrItinClass; def IIC_iMUL16 : InstrItinClass; def IIC_iMAC16 : InstrItinClass; def IIC_iMUL32 : InstrItinClass; def IIC_iMAC32 : InstrItinClass; def IIC_iMUL64 : InstrItinClass; def IIC_iMAC64 : InstrItinClass; -def IIC_iLoadi : InstrItinClass; -def IIC_iLoadr : InstrItinClass; -def IIC_iLoadsi : InstrItinClass; -def IIC_iLoadiu : InstrItinClass; -def IIC_iLoadru : InstrItinClass; -def IIC_iLoadsiu : InstrItinClass; -def IIC_iLoadm : InstrItinClass; -def IIC_iStorei : InstrItinClass; -def IIC_iStorer : InstrItinClass; -def IIC_iStoresi : InstrItinClass; -def IIC_iStoreiu : InstrItinClass; -def IIC_iStoreru : InstrItinClass; -def IIC_iStoresiu : InstrItinClass; -def IIC_iStorem : InstrItinClass; +def IIC_iLoad_i : InstrItinClass; +def IIC_iLoad_r : InstrItinClass; +def IIC_iLoad_si : InstrItinClass; +def IIC_iLoad_iu : InstrItinClass; +def IIC_iLoad_ru : InstrItinClass; +def IIC_iLoad_siu : InstrItinClass; +def IIC_iLoad_bh_i : InstrItinClass; +def IIC_iLoad_bh_r : InstrItinClass; +def IIC_iLoad_bh_si : InstrItinClass; +def IIC_iLoad_bh_iu : InstrItinClass; +def IIC_iLoad_bh_ru : InstrItinClass; +def IIC_iLoad_bh_siu : InstrItinClass; +def IIC_iLoad_d_i : InstrItinClass; +def IIC_iLoad_d_r : InstrItinClass; +def IIC_iLoad_d_ru : InstrItinClass; +def IIC_iLoad_m : InstrItinClass<0>; // micro-coded +def IIC_iLoad_mu : InstrItinClass<0>; // micro-coded +def IIC_iLoad_mBr : InstrItinClass<0>; // micro-coded +def IIC_iPop : InstrItinClass<0>; // micro-coded +def IIC_iPop_Br : InstrItinClass<0>; // micro-coded +def IIC_iLoadiALU : InstrItinClass; +def IIC_iStore_i : InstrItinClass; +def IIC_iStore_r : InstrItinClass; +def IIC_iStore_si : InstrItinClass; +def IIC_iStore_iu : InstrItinClass; +def IIC_iStore_ru : InstrItinClass; +def IIC_iStore_siu : InstrItinClass; +def IIC_iStore_bh_i : InstrItinClass; +def IIC_iStore_bh_r : InstrItinClass; +def IIC_iStore_bh_si : InstrItinClass; +def IIC_iStore_bh_iu : InstrItinClass; +def IIC_iStore_bh_ru : InstrItinClass; +def IIC_iStore_bh_siu : InstrItinClass; +def IIC_iStore_d_i : InstrItinClass; +def IIC_iStore_d_r : InstrItinClass; +def IIC_iStore_d_ru : InstrItinClass; +def IIC_iStore_m : InstrItinClass<0>; // micro-coded +def IIC_iStore_mu : InstrItinClass<0>; // micro-coded +def IIC_Preload : InstrItinClass; def IIC_Br : InstrItinClass; def IIC_fpSTAT : InstrItinClass; def IIC_fpUNA32 : InstrItinClass; @@ -80,19 +124,76 @@ def IIC_fpSQRT32 : InstrItinClass; def IIC_fpSQRT64 : InstrItinClass; def IIC_fpLoad32 : InstrItinClass; def IIC_fpLoad64 : InstrItinClass; -def IIC_fpLoadm : InstrItinClass; +def IIC_fpLoad_m : InstrItinClass<0>; // micro-coded +def IIC_fpLoad_mu : InstrItinClass<0>; // micro-coded def IIC_fpStore32 : InstrItinClass; def IIC_fpStore64 : InstrItinClass; -def IIC_fpStorem : InstrItinClass; +def IIC_fpStore_m : InstrItinClass<0>; // micro-coded +def IIC_fpStore_mu : InstrItinClass<0>; // micro-coded def IIC_VLD1 : InstrItinClass; +def IIC_VLD1x2 : InstrItinClass; +def IIC_VLD1x3 : InstrItinClass; +def IIC_VLD1x4 : InstrItinClass; +def IIC_VLD1u : InstrItinClass; +def IIC_VLD1x2u : InstrItinClass; +def IIC_VLD1x3u : InstrItinClass; +def IIC_VLD1x4u : InstrItinClass; +def IIC_VLD1ln : InstrItinClass; +def IIC_VLD1lnu : InstrItinClass; +def IIC_VLD1dup : InstrItinClass; +def IIC_VLD1dupu : InstrItinClass; def IIC_VLD2 : InstrItinClass; +def IIC_VLD2x2 : InstrItinClass; +def IIC_VLD2u : InstrItinClass; +def IIC_VLD2x2u : InstrItinClass; +def IIC_VLD2ln : InstrItinClass; +def IIC_VLD2lnu : InstrItinClass; +def IIC_VLD2dup : InstrItinClass; +def IIC_VLD2dupu : InstrItinClass; def IIC_VLD3 : InstrItinClass; +def IIC_VLD3ln : InstrItinClass; +def IIC_VLD3u : InstrItinClass; +def IIC_VLD3lnu : InstrItinClass; +def IIC_VLD3dup : InstrItinClass; +def IIC_VLD3dupu : InstrItinClass; def IIC_VLD4 : InstrItinClass; -def IIC_VST : InstrItinClass; +def IIC_VLD4ln : InstrItinClass; +def IIC_VLD4u : InstrItinClass; +def IIC_VLD4lnu : InstrItinClass; +def IIC_VLD4dup : InstrItinClass; +def IIC_VLD4dupu : InstrItinClass; +def IIC_VST1 : InstrItinClass; +def IIC_VST1x2 : InstrItinClass; +def IIC_VST1x3 : InstrItinClass; +def IIC_VST1x4 : InstrItinClass; +def IIC_VST1u : InstrItinClass; +def IIC_VST1x2u : InstrItinClass; +def IIC_VST1x3u : InstrItinClass; +def IIC_VST1x4u : InstrItinClass; +def IIC_VST1ln : InstrItinClass; +def IIC_VST1lnu : InstrItinClass; +def IIC_VST2 : InstrItinClass; +def IIC_VST2x2 : InstrItinClass; +def IIC_VST2u : InstrItinClass; +def IIC_VST2x2u : InstrItinClass; +def IIC_VST2ln : InstrItinClass; +def IIC_VST2lnu : InstrItinClass; +def IIC_VST3 : InstrItinClass; +def IIC_VST3u : InstrItinClass; +def IIC_VST3ln : InstrItinClass; +def IIC_VST3lnu : InstrItinClass; +def IIC_VST4 : InstrItinClass; +def IIC_VST4u : InstrItinClass; +def IIC_VST4ln : InstrItinClass; +def IIC_VST4lnu : InstrItinClass; def IIC_VUNAD : InstrItinClass; def IIC_VUNAQ : InstrItinClass; def IIC_VBIND : InstrItinClass; def IIC_VBINQ : InstrItinClass; +def IIC_VPBIND : InstrItinClass; +def IIC_VFMULD : InstrItinClass; +def IIC_VFMULQ : InstrItinClass; +def IIC_VMOV : InstrItinClass; def IIC_VMOVImm : InstrItinClass; def IIC_VMOVD : InstrItinClass; def IIC_VMOVQ : InstrItinClass; @@ -101,6 +202,7 @@ def IIC_VMOVID : InstrItinClass; def IIC_VMOVISL : InstrItinClass; def IIC_VMOVSI : InstrItinClass; def IIC_VMOVDI : InstrItinClass; +def IIC_VMOVN : InstrItinClass; def IIC_VPERMD : InstrItinClass; def IIC_VPERMQ : InstrItinClass; def IIC_VPERMQ3 : InstrItinClass; @@ -152,7 +254,7 @@ def IIC_VTBX4 : InstrItinClass; //===----------------------------------------------------------------------===// // Processor instruction itineraries. -def GenericItineraries : ProcessorItineraries<[], []>; +def GenericItineraries : ProcessorItineraries<[], [], []>; include "ARMScheduleV6.td" include "ARMScheduleA8.td" diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td index 282abca..8d86c01 100644 --- a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td @@ -14,18 +14,17 @@ // // Scheduling information derived from "Cortex-A8 Technical Reference Manual". // Functional Units. -def A8_Issue : FuncUnit; // issue def A8_Pipe0 : FuncUnit; // pipeline 0 def A8_Pipe1 : FuncUnit; // pipeline 1 -def A8_LdSt0 : FuncUnit; // pipeline 0 load/store -def A8_LdSt1 : FuncUnit; // pipeline 1 load/store +def A8_LSPipe : FuncUnit; // Load / store pipeline def A8_NPipe : FuncUnit; // NEON ALU/MUL pipe def A8_NLSPipe : FuncUnit; // NEON LS pipe // // Dual issue pipeline represented by A8_Pipe0 | A8_Pipe1 // def CortexA8Itineraries : ProcessorItineraries< - [A8_Issue, A8_Pipe0, A8_Pipe1, A8_LdSt0, A8_LdSt1, A8_NPipe, A8_NLSPipe], [ + [A8_Pipe0, A8_Pipe1, A8_LSPipe, A8_NPipe, A8_NLSPipe], + [], [ // Two fully-pipelined integer ALU pipelines // // No operand cycles @@ -35,12 +34,23 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData<IIC_iALUi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, InstrItinData<IIC_iALUr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, InstrItinData<IIC_iALUsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsir,[InstrStage<1,[A8_Pipe0, A8_Pipe1]>], [2, 1, 2]>, InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, // + // Bitwise Instructions that produce a result + InstrItinData<IIC_iBITi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iBITr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iBITsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iBITsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, + // // Unary Instructions that produce a result InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, InstrItinData<IIC_iUNAsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iUNAsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + // + // Zero and sign extension instructions + InstrItinData<IIC_iEXTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iEXTAr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>],[2, 2, 1, 1]>, // // Compare instructions InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, @@ -48,124 +58,184 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData<IIC_iCMPsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, InstrItinData<IIC_iCMPsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, // + // Test instructions + InstrItinData<IIC_iTSTi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iTSTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iTSTsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iTSTsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + // // Move instructions, unconditional InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, InstrItinData<IIC_iMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, + InstrItinData<IIC_iMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3]>, + InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LSPipe]>], [5]>, // // Move instructions, conditional InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, InstrItinData<IIC_iCMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, InstrItinData<IIC_iCMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3, 1]>, + // + // MVN instructions + InstrItinData<IIC_iMVNi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, + InstrItinData<IIC_iMVNr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMVNsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMVNsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, // Integer multiply pipeline // Result written in E5, but that is relative to the last cycle of multicycle, // so we use 6 for those cases // InstrItinData<IIC_iMUL16 , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>, - InstrItinData<IIC_iMAC16 , [InstrStage<1, [A8_Pipe1], 0>, - InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, - InstrItinData<IIC_iMUL32 , [InstrStage<1, [A8_Pipe1], 0>, - InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>, - InstrItinData<IIC_iMAC32 , [InstrStage<1, [A8_Pipe1], 0>, - InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, - InstrItinData<IIC_iMUL64 , [InstrStage<2, [A8_Pipe1], 0>, - InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, - InstrItinData<IIC_iMAC64 , [InstrStage<2, [A8_Pipe1], 0>, - InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, + InstrItinData<IIC_iMUL32 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, + InstrItinData<IIC_iMUL64 , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, // Integer load pipeline // - // loads have an extra cycle of latency, but are fully pipelined - // use A8_Issue to enforce the 1 load/store per cycle limit - // // Immediate offset - InstrItinData<IIC_iLoadi , [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [3, 1]>, + InstrItinData<IIC_iLoad_i , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iLoad_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, // // Register offset - InstrItinData<IIC_iLoadr , [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [3, 1, 1]>, + InstrItinData<IIC_iLoad_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, // // Scaled register offset, issues over 2 cycles - InstrItinData<IIC_iLoadsi , [InstrStage<2, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0], 0>, - InstrStage<1, [A8_Pipe1]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [4, 1, 1]>, + // FIXME: lsl by 2 takes 1 cycle. + InstrItinData<IIC_iLoad_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>, // // Immediate offset with update - InstrItinData<IIC_iLoadiu , [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [3, 2, 1]>, + InstrItinData<IIC_iLoad_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>, + InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>, // // Register offset with update - InstrItinData<IIC_iLoadru , [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [3, 2, 1, 1]>, + InstrItinData<IIC_iLoad_ru , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>, + InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>, + InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>, // // Scaled register offset with update, issues over 2 cycles - InstrItinData<IIC_iLoadsiu , [InstrStage<2, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0], 0>, - InstrStage<1, [A8_Pipe1]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [4, 3, 1, 1]>, - // - // Load multiple - InstrItinData<IIC_iLoadm , [InstrStage<2, [A8_Issue], 0>, - InstrStage<2, [A8_Pipe0], 0>, - InstrStage<2, [A8_Pipe1]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>]>, + InstrItinData<IIC_iLoad_siu , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>, + InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>, + // + // Load multiple, def is the 5th operand. Pipeline 0 only. + // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. + InstrItinData<IIC_iLoad_m , [InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_LSPipe]>], [1, 1, 1, 1, 3]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>, + InstrStage<3, [A8_LSPipe]>], [2, 1, 1, 1, 3]>, + // + // Load multiple plus branch + InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>, + InstrStage<3, [A8_LSPipe]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], + [1, 2, 1, 1, 3]>, + // + // Pop, def is the 3rd operand. + InstrItinData<IIC_iPop , [InstrStage<3, [A8_Pipe0], 0>, + InstrStage<3, [A8_LSPipe]>], [1, 1, 3]>, + // + // Push, def is the 3th operand. + InstrItinData<IIC_iPop_Br, [InstrStage<3, [A8_Pipe0], 0>, + InstrStage<3, [A8_LSPipe]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], + [1, 1, 3]>, - // Integer store pipeline // - // use A8_Issue to enforce the 1 load/store per cycle limit + // iLoadi + iALUr for t2LDRpci_pic. + InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [4, 1]>, + + + // Integer store pipeline // // Immediate offset - InstrItinData<IIC_iStorei , [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [3, 1]>, + InstrItinData<IIC_iStore_i , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, // // Register offset - InstrItinData<IIC_iStorer , [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [3, 1, 1]>, + InstrItinData<IIC_iStore_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, // // Scaled register offset, issues over 2 cycles - InstrItinData<IIC_iStoresi , [InstrStage<2, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0], 0>, - InstrStage<1, [A8_Pipe1]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [3, 1, 1]>, + InstrItinData<IIC_iStore_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iStore_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>, // // Immediate offset with update - InstrItinData<IIC_iStoreiu , [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [2, 3, 1]>, + InstrItinData<IIC_iStore_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>, + InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>, // // Register offset with update - InstrItinData<IIC_iStoreru , [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [2, 3, 1, 1]>, + InstrItinData<IIC_iStore_ru , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>, + InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>, + InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>, // // Scaled register offset with update, issues over 2 cycles - InstrItinData<IIC_iStoresiu, [InstrStage<2, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0], 0>, - InstrStage<1, [A8_Pipe1]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>], [3, 3, 1, 1]>, - // - // Store multiple - InstrItinData<IIC_iStorem , [InstrStage<2, [A8_Issue], 0>, - InstrStage<2, [A8_Pipe0], 0>, - InstrStage<2, [A8_Pipe1]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>]>, + InstrItinData<IIC_iStore_siu, [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>, + InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>, + // + // Store multiple. Pipeline 0 only. + // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. + InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_LSPipe]>]>, + // + // Store multiple + update + InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_LSPipe]>], [2]>, + + // + // Preload + InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, // Branch // @@ -178,440 +248,786 @@ def CortexA8Itineraries : ProcessorItineraries< // possible. // // FP Special Register to Integer Register File Move - InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_NLSPipe]>]>, + InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [20]>, // // Single-precision FP Unary - InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [7, 1]>, // // Double-precision FP Unary - InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<4, [A8_NPipe], 0>, InstrStage<4, [A8_NLSPipe]>], [4, 1]>, // // Single-precision FP Compare - InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [1, 1]>, // // Double-precision FP Compare - InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<4, [A8_NPipe], 0>, InstrStage<4, [A8_NLSPipe]>], [4, 1]>, // // Single to Double FP Convert - InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<7, [A8_NPipe], 0>, InstrStage<7, [A8_NLSPipe]>], [7, 1]>, // // Double to Single FP Convert - InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<5, [A8_NPipe], 0>, InstrStage<5, [A8_NLSPipe]>], [5, 1]>, // // Single-Precision FP to Integer Convert - InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [7, 1]>, // // Double-Precision FP to Integer Convert - InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<8, [A8_NPipe], 0>, InstrStage<8, [A8_NLSPipe]>], [8, 1]>, // // Integer to Single-Precision FP Convert - InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [7, 1]>, // // Integer to Double-Precision FP Convert - InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<8, [A8_NPipe], 0>, InstrStage<8, [A8_NLSPipe]>], [8, 1]>, // // Single-precision FP ALU - InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [7, 1, 1]>, // // Double-precision FP ALU - InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<9, [A8_NPipe], 0>, InstrStage<9, [A8_NLSPipe]>], [9, 1, 1]>, // // Single-precision FP Multiply - InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [7, 1, 1]>, // // Double-precision FP Multiply - InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<11, [A8_NPipe], 0>, InstrStage<11, [A8_NLSPipe]>], [11, 1, 1]>, // // Single-precision FP MAC - InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>, // // Double-precision FP MAC - InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<19, [A8_NPipe], 0>, InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, // // Single-precision FP DIV - InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<20, [A8_NPipe], 0>, InstrStage<20, [A8_NLSPipe]>], [20, 1, 1]>, // // Double-precision FP DIV - InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<29, [A8_NPipe], 0>, InstrStage<29, [A8_NLSPipe]>], [29, 1, 1]>, // // Single-precision FP SQRT - InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<19, [A8_NPipe], 0>, InstrStage<19, [A8_NLSPipe]>], [19, 1]>, // // Double-precision FP SQRT - InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<29, [A8_NPipe], 0>, InstrStage<29, [A8_NLSPipe]>], [29, 1]>, + + // + // Integer to Single-precision Move + InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [2, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_fpMOVID, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [2, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [20, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [20, 20, 1]>, + // // Single-precision FP Load - // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>]>, + InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [2, 1]>, // // Double-precision FP Load - // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpLoad64, [InstrStage<2, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0], 0>, - InstrStage<1, [A8_Pipe1]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>]>, + InstrItinData<IIC_fpLoad64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [2, 1]>, // // FP Load Multiple - // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpLoadm, [InstrStage<3, [A8_Issue], 0>, - InstrStage<2, [A8_Pipe0], 0>, - InstrStage<2, [A8_Pipe1]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>]>, + // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. + InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 2]>, + // + // FP Load Multiple + update + InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 2]>, // // Single-precision FP Store - // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>]>, + InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [1, 1]>, // // Double-precision FP Store - // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpStore64,[InstrStage<2, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0], 0>, - InstrStage<1, [A8_Pipe1]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>]>, + InstrItinData<IIC_fpStore64,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [1, 1]>, // // FP Store Multiple - // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpStorem, [InstrStage<3, [A8_Issue], 0>, - InstrStage<2, [A8_Pipe0], 0>, - InstrStage<2, [A8_Pipe1]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>]>, + InstrItinData<IIC_fpStore_m,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 1]>, + // + // FP Store Multiple + update + InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 1]>, // NEON // Issue through integer pipeline, and execute in NEON unit. // // VLD1 - // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD1, [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>]>, + InstrItinData<IIC_VLD1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1]>, + // VLD1x2 + InstrItinData<IIC_VLD1x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD1x3 + InstrItinData<IIC_VLD1x3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 1]>, + // + // VLD1x4 + InstrItinData<IIC_VLD1x4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 1]>, + // + // VLD1u + InstrItinData<IIC_VLD1u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD1x2u + InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 2, 1]>, + // + // VLD1x3u + InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 2, 1]>, + // + // VLD1x4u + InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 2, 1]>, + // + // VLD1ln + InstrItinData<IIC_VLD1ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 1, 1, 1]>, + // + // VLD1lnu + InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 2, 1, 1, 1, 1]>, + // + // VLD1dup + InstrItinData<IIC_VLD1dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1]>, + // + // VLD1dupu + InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1, 1]>, // // VLD2 - // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>], [2, 2, 1]>, + InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD2x2 + InstrItinData<IIC_VLD2x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 1]>, + // + // VLD2ln + InstrItinData<IIC_VLD2ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 3, 1, 1, 1, 1]>, + // + // VLD2u + InstrItinData<IIC_VLD2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 2, 1, 1, 1]>, + // + // VLD2x2u + InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 2, 1]>, + // + // VLD2lnu + InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 3, 2, 1, 1, 1, 1, 1]>, + // + // VLD2dup + InstrItinData<IIC_VLD2dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD2dupu + InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 2, 1, 1]>, // // VLD3 - // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 1]>, + InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 1]>, + // + // VLD3ln + InstrItinData<IIC_VLD3ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 1, 1, 1, 1, 2]>, + // + // VLD3u + InstrItinData<IIC_VLD3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 2, 1]>, + // + // VLD3lnu + InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>, + // + // VLD3dup + InstrItinData<IIC_VLD3dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 1]>, + // + // VLD3dupu + InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 2, 1, 1]>, // // VLD4 - // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD4, [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 2, 1]>, + InstrItinData<IIC_VLD4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 4, 1]>, + // + // VLD4ln + InstrItinData<IIC_VLD4ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>, + // + // VLD4u + InstrItinData<IIC_VLD4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 4, 2, 1]>, + // + // VLD4lnu + InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VLD4dup + InstrItinData<IIC_VLD4dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 1]>, + // + // VLD4dupu + InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 2, 1, 1]>, + // + // VST1 + InstrItinData<IIC_VST1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1]>, // - // VST - // FIXME: We don't model this instruction properly - InstrItinData<IIC_VST, [InstrStage<1, [A8_Issue], 0>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0], 0>, - InstrStage<1, [A8_NLSPipe]>]>, + // VST1x2 + InstrItinData<IIC_VST1x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1]>, + // + // VST1x3 + InstrItinData<IIC_VST1x3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2]>, + // + // VST1x4 + InstrItinData<IIC_VST1x4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST1u + InstrItinData<IIC_VST1u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1]>, + // + // VST1x2u + InstrItinData<IIC_VST1x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST1x3u + InstrItinData<IIC_VST1x3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST1x4u + InstrItinData<IIC_VST1x4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST1ln + InstrItinData<IIC_VST1ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1]>, + // + // VST1lnu + InstrItinData<IIC_VST1lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1]>, + // + // VST2 + InstrItinData<IIC_VST2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1]>, + // + // VST2x2 + InstrItinData<IIC_VST2x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST2u + InstrItinData<IIC_VST2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST2x2u + InstrItinData<IIC_VST2x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST2ln + InstrItinData<IIC_VST2ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1]>, + // + // VST2lnu + InstrItinData<IIC_VST2lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST3 + InstrItinData<IIC_VST3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2]>, + // + // VST3u + InstrItinData<IIC_VST3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST3ln + InstrItinData<IIC_VST3ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2]>, + // + // VST3lnu + InstrItinData<IIC_VST3lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST4 + InstrItinData<IIC_VST4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4u + InstrItinData<IIC_VST4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST4ln + InstrItinData<IIC_VST4ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4lnu + InstrItinData<IIC_VST4lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, // // Double-register FP Unary - InstrItinData<IIC_VUNAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VUNAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [5, 2]>, // // Quad-register FP Unary // Result written in N5, but that is relative to the last cycle of multicycle, // so we use 6 for those cases - InstrItinData<IIC_VUNAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VUNAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [6, 2]>, // // Double-register FP Binary - InstrItinData<IIC_VBIND, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VBIND, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [5, 2, 2]>, // + // VPADD, etc. + InstrItinData<IIC_VPBIND, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [5, 2, 2]>, + // + // Double-register FP VMUL + InstrItinData<IIC_VFMULD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [5, 2, 1]>, + + // // Quad-register FP Binary // Result written in N5, but that is relative to the last cycle of multicycle, // so we use 6 for those cases - InstrItinData<IIC_VBINQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VBINQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [6, 2, 2]>, // + // Quad-register FP VMUL + InstrItinData<IIC_VFMULQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [6, 2, 1]>, + // + // Move + InstrItinData<IIC_VMOV, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [1, 1]>, + // // Move Immediate - InstrItinData<IIC_VMOVImm, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMOVImm, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [3]>, // // Double-register Permute Move - InstrItinData<IIC_VMOVD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMOVD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 1]>, // // Quad-register Permute Move // Result written in N2, but that is relative to the last cycle of multicycle, // so we use 3 for those cases - InstrItinData<IIC_VMOVQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMOVQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NLSPipe]>], [3, 1]>, // // Integer to Single-precision Move - InstrItinData<IIC_VMOVIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMOVIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 1]>, // // Integer to Double-precision Move - InstrItinData<IIC_VMOVID , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMOVID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>, // // Single-precision to Integer Move - InstrItinData<IIC_VMOVSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMOVSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>], [20, 1]>, // // Double-precision to Integer Move - InstrItinData<IIC_VMOVDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMOVDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>], [20, 20, 1]>, // // Integer to Lane Move - InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>, // + // Vector narrow move + InstrItinData<IIC_VMOVN , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [2, 1]>, + // // Double-register Permute - InstrItinData<IIC_VPERMD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VPERMD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 2, 1, 1]>, // // Quad-register Permute // Result written in N2, but that is relative to the last cycle of multicycle, // so we use 3 for those cases - InstrItinData<IIC_VPERMQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VPERMQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NLSPipe]>], [3, 3, 1, 1]>, // // Quad-register Permute (3 cycle issue) // Result written in N2, but that is relative to the last cycle of multicycle, // so we use 4 for those cases - InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, InstrStage<2, [A8_NLSPipe]>], [4, 4, 1, 1]>, // // Double-register FP Multiple-Accumulate - InstrItinData<IIC_VMACD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMACD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>, // // Quad-register FP Multiple-Accumulate // Result written in N9, but that is relative to the last cycle of multicycle, // so we use 10 for those cases - InstrItinData<IIC_VMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>, // // Double-register Reciprical Step - InstrItinData<IIC_VRECSD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VRECSD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [9, 2, 2]>, // // Quad-register Reciprical Step - InstrItinData<IIC_VRECSQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VRECSQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [10, 2, 2]>, // // Double-register Integer Count - InstrItinData<IIC_VCNTiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VCNTiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [3, 2, 2]>, // // Quad-register Integer Count // Result written in N3, but that is relative to the last cycle of multicycle, // so we use 4 for those cases - InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [4, 2, 2]>, // // Double-register Integer Unary - InstrItinData<IIC_VUNAiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VUNAiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [4, 2]>, // // Quad-register Integer Unary - InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [4, 2]>, // // Double-register Integer Q-Unary - InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [4, 1]>, // // Quad-register Integer CountQ-Unary - InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [4, 1]>, // // Double-register Integer Binary - InstrItinData<IIC_VBINiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VBINiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [3, 2, 2]>, // // Quad-register Integer Binary - InstrItinData<IIC_VBINiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VBINiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [3, 2, 2]>, // // Double-register Integer Binary (4 cycle) - InstrItinData<IIC_VBINi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VBINi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, // // Quad-register Integer Binary (4 cycle) - InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, // // Double-register Integer Subtract - InstrItinData<IIC_VSUBiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VSUBiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [3, 2, 1]>, // // Quad-register Integer Subtract - InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [3, 2, 1]>, // // Double-register Integer Subtract - InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, // // Quad-register Integer Subtract - InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, // // Double-register Integer Shift - InstrItinData<IIC_VSHLiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VSHLiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [3, 1, 1]>, // // Quad-register Integer Shift - InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [4, 1, 1]>, // // Double-register Integer Shift (4 cycle) - InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [4, 1, 1]>, // // Quad-register Integer Shift (4 cycle) - InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [5, 1, 1]>, // // Double-register Integer Pair Add Long - InstrItinData<IIC_VPALiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VPALiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [6, 3, 1]>, // // Quad-register Integer Pair Add Long - InstrItinData<IIC_VPALiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VPALiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [7, 3, 1]>, // // Double-register Absolute Difference and Accumulate - InstrItinData<IIC_VABAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VABAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [6, 3, 2, 1]>, // // Quad-register Absolute Difference and Accumulate - InstrItinData<IIC_VABAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VABAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [6, 3, 2, 1]>, // // Double-register Integer Multiply (.8, .16) - InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [6, 2, 2]>, // // Double-register Integer Multiply (.32) - InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [7, 2, 1]>, // // Quad-register Integer Multiply (.8, .16) - InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [7, 2, 2]>, // // Quad-register Integer Multiply (.32) - InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>, InstrStage<2, [A8_NLSPipe], 0>, InstrStage<3, [A8_NPipe]>], [9, 2, 1]>, // // Double-register Integer Multiply-Accumulate (.8, .16) - InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [6, 3, 2, 2]>, // // Double-register Integer Multiply-Accumulate (.32) - InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [7, 3, 2, 1]>, // // Quad-register Integer Multiply-Accumulate (.8, .16) - InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [7, 3, 2, 2]>, // // Quad-register Integer Multiply-Accumulate (.32) - InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>, InstrStage<2, [A8_NLSPipe], 0>, InstrStage<3, [A8_NPipe]>], [9, 3, 2, 1]>, // // Double-register VEXT - InstrItinData<IIC_VEXTD, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VEXTD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>, // // Quad-register VEXT - InstrItinData<IIC_VEXTQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VEXTQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>, // // VTB - InstrItinData<IIC_VTB1, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VTB1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NLSPipe]>], [3, 2, 1]>, - InstrItinData<IIC_VTB2, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VTB2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NLSPipe]>], [3, 2, 2, 1]>, - InstrItinData<IIC_VTB3, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VTB3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 1]>, - InstrItinData<IIC_VTB4, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VTB4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>, // // VTBX - InstrItinData<IIC_VTBX1, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VTBX1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 1]>, - InstrItinData<IIC_VTBX2, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VTBX2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 2, 1]>, - InstrItinData<IIC_VTBX3, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VTBX3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, InstrStage<2, [A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>, - InstrItinData<IIC_VTBX4, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrItinData<IIC_VTBX4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td index df2f896..82c6735 100644 --- a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td @@ -16,130 +16,417 @@ // Reference Manual". // // Functional units -def A9_Pipe0 : FuncUnit; // pipeline 0 -def A9_Pipe1 : FuncUnit; // pipeline 1 -def A9_LSPipe : FuncUnit; // LS pipe -def A9_NPipe : FuncUnit; // NEON ALU/MUL pipe +def A9_Issue0 : FuncUnit; // Issue 0 +def A9_Issue1 : FuncUnit; // Issue 1 +def A9_Branch : FuncUnit; // Branch +def A9_ALU0 : FuncUnit; // ALU / MUL pipeline 0 +def A9_ALU1 : FuncUnit; // ALU pipeline 1 +def A9_AGU : FuncUnit; // Address generation unit for ld / st +def A9_NPipe : FuncUnit; // NEON pipeline +def A9_MUX0 : FuncUnit; // AGU + NEON/FPU multiplexer +def A9_LSUnit : FuncUnit; // L/S Unit def A9_DRegsVFP: FuncUnit; // FP register set, VFP side def A9_DRegsN : FuncUnit; // FP register set, NEON side -// Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1 -// +// Bypasses +def A9_LdBypass : Bypass; + def CortexA9Itineraries : ProcessorItineraries< - [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1], [ + [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0, + A9_LSUnit, A9_DRegsVFP, A9_DRegsN], + [A9_LdBypass], [ // Two fully-pipelined integer ALU pipelines - // FIXME: There are no operand latencies for these instructions at all! + // // Move instructions, unconditional - InstrItinData<IIC_iMOVi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>, - InstrItinData<IIC_iMOVr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>, - InstrItinData<IIC_iMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>, - InstrItinData<IIC_iMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iMOVi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>, + InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>, + InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [5]>, + // + // MVN instructions + InstrItinData<IIC_iMVNi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1]>, + InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [NoBypass, A9_LdBypass]>, + InstrItinData<IIC_iMVNsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1]>, + InstrItinData<IIC_iMVNsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [3, 1, 1]>, // // No operand cycles - InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>, + InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>]>, // // Binary Instructions that produce a result - InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>, - InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 2]>, - InstrItinData<IIC_iALUsi, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>, - InstrItinData<IIC_iALUsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1, 1]>, + InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [NoBypass, A9_LdBypass]>, + InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>, + InstrItinData<IIC_iALUsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>, + InstrItinData<IIC_iALUsir,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>, + InstrItinData<IIC_iALUsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [3, 1, 1, 1], + [NoBypass, A9_LdBypass, NoBypass, NoBypass]>, + // + // Bitwise Instructions that produce a result + InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>, + InstrItinData<IIC_iBITsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData<IIC_iBITsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>, // // Unary Instructions that produce a result - InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>, - InstrItinData<IIC_iUNAsi , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iUNAsr , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>, + + // CLZ, RBIT, etc. + InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + + // BFC, BFI, UBFX, SBFX + InstrItinData<IIC_iUNAsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>, + + // + // Zero and sign extension instructions + InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>, + InstrItinData<IIC_iEXTAr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>, + InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>, // // Compare instructions - InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>, - InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>, - InstrItinData<IIC_iCMPsi , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iCMPsr , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1], [A9_LdBypass]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [A9_LdBypass, A9_LdBypass]>, + InstrItinData<IIC_iCMPsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [1, 1], [A9_LdBypass, NoBypass]>, + InstrItinData<IIC_iCMPsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>, + // + // Test instructions + InstrItinData<IIC_iTSTi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData<IIC_iTSTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iTSTsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iTSTsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>, // // Move instructions, conditional - InstrItinData<IIC_iCMOVi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>, - InstrItinData<IIC_iCMOVr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iCMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>, + // FIXME: Correctly model the extra input dep on the destination. + InstrItinData<IIC_iCMOVi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>, // Integer multiply pipeline // - InstrItinData<IIC_iMUL16 , [InstrStage<1, [A9_Pipe1], 0>, - InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>, - InstrItinData<IIC_iMAC16 , [InstrStage<1, [A9_Pipe1], 0>, - InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>, - InstrItinData<IIC_iMUL32 , [InstrStage<1, [A9_Pipe1], 0>, - InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>, - InstrItinData<IIC_iMAC32 , [InstrStage<1, [A9_Pipe1], 0>, - InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>, - InstrItinData<IIC_iMUL64 , [InstrStage<2, [A9_Pipe1], 0>, - InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>, - InstrItinData<IIC_iMAC64 , [InstrStage<2, [A9_Pipe1], 0>, - InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>, + InstrItinData<IIC_iMUL16 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], [3, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], + [3, 1, 1, 1]>, + InstrItinData<IIC_iMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], [4, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], + [4, 1, 1, 1]>, + InstrItinData<IIC_iMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0]>], + [4, 5, 1, 1]>, // Integer load pipeline // FIXME: The timings are some rough approximations // // Immediate offset - InstrItinData<IIC_iLoadi , [InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iLoad_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 1], [A9_LdBypass]>, + // FIXME: If address is 64-bit aligned, AGU cycles is 1. + InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1], [A9_LdBypass]>, // // Register offset - InstrItinData<IIC_iLoadr , [InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iLoad_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1, 1], [A9_LdBypass]>, // // Scaled register offset - InstrItinData<IIC_iLoadsi , [InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_LSPipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_si , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit], 0>], + [4, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [5, 1, 1], [A9_LdBypass]>, // // Immediate offset with update - InstrItinData<IIC_iLoadiu , [InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_LSPipe]>], [3, 2, 1]>, + InstrItinData<IIC_iLoad_iu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 2, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1], [A9_LdBypass]>, // // Register offset with update - InstrItinData<IIC_iLoadru , [InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_LSPipe]>], [3, 2, 1, 1]>, + InstrItinData<IIC_iLoad_ru , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 2, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1, 1], [A9_LdBypass]>, // // Scaled register offset with update - InstrItinData<IIC_iLoadsiu , [InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_LSPipe]>], [4, 3, 1, 1]>, + InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [5, 4, 1, 1], [A9_LdBypass]>, + // + // Load multiple, def is the 5th operand. + // FIXME: This assumes 3 to 4 registers. + InstrItinData<IIC_iLoad_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + // + // Load multiple plus branch + InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>, + InstrStage<1, [A9_Branch]>], + [1, 2, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + // + // Pop, def is the 3rd operand. + InstrItinData<IIC_iPop , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 3], + [NoBypass, NoBypass, A9_LdBypass]>, + // + // Pop + branch, def is the 3rd operand. + InstrItinData<IIC_iPop_Br, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>, + InstrStage<1, [A9_Branch]>], + [1, 1, 3], + [NoBypass, NoBypass, A9_LdBypass]>, + // - // Load multiple - InstrItinData<IIC_iLoadm , [InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_LSPipe]>]>, + // iLoadi + iALUr for t2LDRpci_pic. + InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [2, 1]>, // Integer store pipeline /// // Immediate offset - InstrItinData<IIC_iStorei , [InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iStore_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, + InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, + // FIXME: If address is 64-bit aligned, AGU cycles is 1. + InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, // // Register offset - InstrItinData<IIC_iStorer , [InstrStage<1, [ A9_Pipe1]>, - InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iStore_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // Scaled register offset - InstrItinData<IIC_iStoresi , [InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iStore_si , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // Immediate offset with update - InstrItinData<IIC_iStoreiu , [InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_LSPipe]>], [2, 3, 1]>, + InstrItinData<IIC_iStore_iu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>, + InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>, // // Register offset with update - InstrItinData<IIC_iStoreru , [InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_LSPipe]>], [2, 3, 1, 1]>, + InstrItinData<IIC_iStore_ru , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1, 1]>, + InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1, 1]>, // // Scaled register offset with update - InstrItinData<IIC_iStoresiu, [InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_LSPipe]>], [3, 3, 1, 1]>, + InstrItinData<IIC_iStore_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1, 1]>, // // Store multiple - InstrItinData<IIC_iStorem , [InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_LSPipe]>]>, + InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<2, [A9_LSUnit]>]>, + // + // Store multiple + update + InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<2, [A9_LSUnit]>], [2]>, + + // + // Preload + InstrItinData<IIC_Preload, [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>, + // Branch // // no delay slots, so the latency of a branch is unimportant - InstrItinData<IIC_Br , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>, + InstrItinData<IIC_Br , [InstrStage<1, [A9_Issue0], 0>, + InstrStage<1, [A9_Issue1], 0>, + InstrStage<1, [A9_Branch]>]>, // VFP and NEON shares the same register file. This means that every VFP // instruction should wait for full completion of the consecutive NEON @@ -159,687 +446,1379 @@ def CortexA9Itineraries : ProcessorItineraries< // Issue through integer pipeline, and execute in NEON unit. // FP Special Register to Integer Register File Move - InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe]>], + [1]>, // // Single-precision FP Unary - InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Double-precision FP Unary - InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Single-precision FP Compare - InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 4 cycles InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Double-precision FP Compare - InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 4 cycles InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Single to Double FP Convert - InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Double to Single FP Convert - InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Single to Half FP Convert - InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Half to Single FP Convert - InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, // // Single-Precision FP to Integer Convert - InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Double-Precision FP to Integer Convert - InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Integer to Single-Precision FP Convert - InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Integer to Double-Precision FP Convert - InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Single-precision FP ALU - InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, // // Double-precision FP ALU - InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, // // Single-precision FP Multiply - InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<6, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [5, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [5, 1, 1]>, // // Double-precision FP Multiply - InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<7, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 1, 1]>, + InstrStage<2, [A9_NPipe]>], + [6, 1, 1]>, // // Single-precision FP MAC - InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<9, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [8, 0, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [8, 1, 1, 1]>, // // Double-precision FP MAC - InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<10, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [9, 0, 1, 1]>, + InstrStage<2, [A9_NPipe]>], + [9, 1, 1, 1]>, // // Single-precision FP DIV - InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<16, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<10, [A9_NPipe]>], [15, 1, 1]>, + InstrStage<10, [A9_NPipe]>], + [15, 1, 1]>, // // Double-precision FP DIV - InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<26, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<20, [A9_NPipe]>], [25, 1, 1]>, + InstrStage<20, [A9_NPipe]>], + [25, 1, 1]>, // // Single-precision FP SQRT - InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<18, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<13, [A9_NPipe]>], [17, 1]>, + InstrStage<13, [A9_NPipe]>], + [17, 1]>, // // Double-precision FP SQRT - InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<33, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<28, [A9_NPipe]>], [32, 1]>, + InstrStage<28, [A9_NPipe]>], + [32, 1]>, // // Integer to Single-precision Move - InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra 1 latency cycle since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Integer to Double-precision Move - InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra 1 latency cycle since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1, 1]>, // // Single-precision to Integer Move - InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, // // Double-precision to Integer Move - InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 1, 1]>, // // Single-precision FP Load - InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, // // Double-precision FP Load - InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + // FIXME: Result latency is 1 if address is 64-bit aligned. + InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, // // FP Load Multiple - InstrItinData<IIC_fpLoadm, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, + // + // FP Load Multiple + update + InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>, // // Single-precision FP Store - InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, // // Double-precision FP Store - InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, // // FP Store Multiple - InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, + // + // FP Store Multiple + update + InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>, // NEON - // Issue through integer pipeline, and execute in NEON unit. - // FIXME: Neon pipeline and LdSt unit are multiplexed. - // Add some syntactic sugar to model this! // VLD1 - // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD1, [InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + // FIXME: Conservatively assume insufficent alignment. + InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1]>, + // VLD1x2 + InstrItinData<IIC_VLD1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 1]>, + // VLD1x3 + InstrItinData<IIC_VLD1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 2, 3, 1]>, + // VLD1x4 + InstrItinData<IIC_VLD1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 2, 3, 3, 1]>, + // VLD1u + InstrItinData<IIC_VLD1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 1]>, + // VLD1x2u + InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 2, 1]>, + // VLD1x3u + InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 2, 3, 2, 1]>, + // VLD1x4u + InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 2, 3, 3, 2, 1]>, + // + // VLD1ln + InstrItinData<IIC_VLD1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [4, 1, 1, 1]>, + // + // VLD1lnu + InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [4, 2, 1, 1, 1, 1]>, + // + // VLD1dup + InstrItinData<IIC_VLD1dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 1]>, + // + // VLD1dupu + InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 2, 1, 1]>, // // VLD2 - // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD2, [InstrStage<1, [A9_DRegsN], 0, Required>, - // Extra latency cycles since wbck is 6 cycles - InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, + InstrItinData<IIC_VLD2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 1]>, + // + // VLD2x2 + InstrItinData<IIC_VLD2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 4, 3, 4, 1]>, + // + // VLD2ln + InstrItinData<IIC_VLD2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [4, 4, 1, 1, 1, 1]>, + // + // VLD2u + InstrItinData<IIC_VLD2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 2, 1, 1, 1]>, + // + // VLD2x2u + InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 4, 3, 4, 2, 1]>, + // + // VLD2lnu + InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [4, 4, 2, 1, 1, 1, 1, 1]>, + // + // VLD2dup + InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 1]>, + // + // VLD2dupu + InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 2, 1, 1]>, // // VLD3 - // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD3, [InstrStage<1, [A9_DRegsN], 0, Required>, - // Extra latency cycles since wbck is 6 cycles - InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>], [2, 2, 2, 1]>, + InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 1]>, + // + // VLD3ln + InstrItinData<IIC_VLD3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<11,[A9_DRegsVFP], 0, Reserved>, + InstrStage<5, [A9_NPipe], 0>, + InstrStage<5, [A9_LSUnit]>], + [5, 5, 6, 1, 1, 1, 1, 2]>, + // + // VLD3u + InstrItinData<IIC_VLD3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 2, 1]>, + // + // VLD3lnu + InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<11,[A9_DRegsVFP], 0, Reserved>, + InstrStage<5, [A9_NPipe], 0>, + InstrStage<5, [A9_LSUnit]>], + [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>, + // + // VLD3dup + InstrItinData<IIC_VLD3dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, + // + // VLD3dupu + InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1, 1]>, // // VLD4 - // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD4, [InstrStage<1, [A9_DRegsN], 0, Required>, - // Extra latency cycles since wbck is 6 cycles - InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>], [2, 2, 2, 2, 1]>, - // - // VST - // FIXME: We don't model this instruction properly - InstrItinData<IIC_VST, [InstrStage<1, [A9_DRegsN], 0, Required>, - // Extra latency cycles since wbck is 6 cycles - InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrItinData<IIC_VLD4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 1]>, + // + // VLD4ln + InstrItinData<IIC_VLD4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<11,[A9_DRegsVFP], 0, Reserved>, + InstrStage<5, [A9_NPipe], 0>, + InstrStage<5, [A9_LSUnit]>], + [5, 5, 6, 6, 1, 1, 1, 1, 2, 2]>, + // + // VLD4u + InstrItinData<IIC_VLD4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 2, 1]>, + // + // VLD4lnu + InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<11,[A9_DRegsVFP], 0, Reserved>, + InstrStage<5, [A9_NPipe], 0>, + InstrStage<5, [A9_LSUnit]>], + [5, 5, 6, 6, 2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VLD4dup + InstrItinData<IIC_VLD4dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 1]>, + // + // VLD4dupu + InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 2, 1, 1]>, + // + // VST1 + InstrItinData<IIC_VST1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1]>, + // + // VST1x2 + InstrItinData<IIC_VST1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST1x3 + InstrItinData<IIC_VST1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST1x4 + InstrItinData<IIC_VST1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST1u + InstrItinData<IIC_VST1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1]>, + // + // VST1x2u + InstrItinData<IIC_VST1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST1x3u + InstrItinData<IIC_VST1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST1x4u + InstrItinData<IIC_VST1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST1ln + InstrItinData<IIC_VST1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1]>, + // + // VST1lnu + InstrItinData<IIC_VST1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1]>, + // + // VST2 + InstrItinData<IIC_VST2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST2x2 + InstrItinData<IIC_VST2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST2u + InstrItinData<IIC_VST2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST2x2u + InstrItinData<IIC_VST2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST2ln + InstrItinData<IIC_VST2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST2lnu + InstrItinData<IIC_VST2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST3 + InstrItinData<IIC_VST3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST3u + InstrItinData<IIC_VST3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST3ln + InstrItinData<IIC_VST3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST3lnu + InstrItinData<IIC_VST3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST4 + InstrItinData<IIC_VST4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4u + InstrItinData<IIC_VST4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST4ln + InstrItinData<IIC_VST4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4lnu + InstrItinData<IIC_VST4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // // Double-register Integer Unary - InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2]>, + InstrStage<1, [A9_NPipe]>], + [4, 2]>, // // Quad-register Integer Unary - InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2]>, + InstrStage<1, [A9_NPipe]>], + [4, 2]>, // // Double-register Integer Q-Unary - InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Quad-register Integer CountQ-Unary - InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Double-register Integer Binary - InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, // // Quad-register Integer Binary - InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, // // Double-register Integer Subtract - InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 1]>, // // Quad-register Integer Subtract - InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 1]>, // // Double-register Integer Shift - InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [3, 1, 1]>, // // Quad-register Integer Shift - InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [3, 1, 1]>, // // Double-register Integer Shift (4 cycle) - InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, // // Quad-register Integer Shift (4 cycle) - InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, // // Double-register Integer Binary (4 cycle) - InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 2]>, // // Quad-register Integer Binary (4 cycle) - InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 2]>, // // Double-register Integer Subtract (4 cycle) - InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 1]>, // // Quad-register Integer Subtract (4 cycle) - InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 1]>, // // Double-register Integer Count - InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, // // Quad-register Integer Count // Result written in N3, but that is relative to the last cycle of multicycle, // so we use 4 for those cases - InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [4, 2, 2]>, + InstrStage<2, [A9_NPipe]>], + [4, 2, 2]>, // // Double-register Absolute Difference and Accumulate - InstrItinData<IIC_VABAD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VABAD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 2, 1]>, // // Quad-register Absolute Difference and Accumulate - InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, // // Double-register Integer Pair Add Long - InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [6, 3, 1]>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 1]>, // // Quad-register Integer Pair Add Long - InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 3, 1]>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 1]>, // // Double-register Integer Multiply (.8, .16) - InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [6, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [6, 2, 2]>, // // Quad-register Integer Multiply (.8, .16) - InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [7, 2, 2]>, + InstrStage<2, [A9_NPipe]>], + [7, 2, 2]>, // // Double-register Integer Multiply (.32) - InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [7, 2, 1]>, + InstrStage<2, [A9_NPipe]>], + [7, 2, 1]>, // // Quad-register Integer Multiply (.32) - InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<4, [A9_NPipe]>], [9, 2, 1]>, + InstrStage<4, [A9_NPipe]>], + [9, 2, 1]>, // // Double-register Integer Multiply-Accumulate (.8, .16) - InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [6, 3, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 2, 2]>, // // Double-register Integer Multiply-Accumulate (.32) - InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [7, 3, 2, 1]>, + InstrStage<2, [A9_NPipe]>], + [7, 3, 2, 1]>, // // Quad-register Integer Multiply-Accumulate (.8, .16) - InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [7, 3, 2, 2]>, + InstrStage<2, [A9_NPipe]>], + [7, 3, 2, 2]>, // // Quad-register Integer Multiply-Accumulate (.32) - InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<4, [A9_NPipe]>], [9, 3, 2, 1]>, + InstrStage<4, [A9_NPipe]>], + [9, 3, 2, 1]>, + + // + // Move + InstrItinData<IIC_VMOV, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1,1]>, // // Move Immediate - InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3]>, + InstrStage<1, [A9_NPipe]>], + [3]>, // // Double-register Permute Move - InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_DRegsN], 0, Required>, - // FIXME: all latencies are arbitrary, no information is available - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_LSPipe]>], [2, 1]>, + InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, // // Quad-register Permute Move - // Result written in N2, but that is relative to the last cycle of multicycle, - // so we use 3 for those cases - InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_DRegsN], 0, Required>, - // FIXME: all latencies are arbitrary, no information is available - InstrStage<4, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 1]>, + InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, // // Integer to Single-precision Move - InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_DRegsN], 0, Required>, - // FIXME: all latencies are arbitrary, no information is available + InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Integer to Double-precision Move - InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_DRegsN], 0, Required>, - // FIXME: all latencies are arbitrary, no information is available + InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1, 1]>, // // Single-precision to Integer Move - InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_DRegsN], 0, Required>, - // FIXME: all latencies are arbitrary, no information is available + InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, // // Double-precision to Integer Move - InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_DRegsN], 0, Required>, - // FIXME: all latencies are arbitrary, no information is available + InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 2, 1]>, // // Integer to Lane Move - InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN], 0, Required>, - // FIXME: all latencies are arbitrary, no information is available + InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<4, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 1, 1]>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 1]>, // + // Vector narrow move + InstrItinData<IIC_VMOVN, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 1]>, + // // Double-register FP Unary - InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [5, 2]>, + InstrStage<1, [A9_NPipe]>], + [5, 2]>, // // Quad-register FP Unary // Result written in N5, but that is relative to the last cycle of multicycle, // so we use 6 for those cases - InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 2]>, + InstrStage<2, [A9_NPipe]>], + [6, 2]>, // // Double-register FP Binary // FIXME: We're using this itin for many instructions and [2, 2] here is too // optimistic. - InstrItinData<IIC_VBIND, [InstrStage<1, [A9_DRegsN], 0, Required>, - // Extra latency cycles since wbck is 7 cycles + InstrItinData<IIC_VBIND, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [5, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [5, 2, 2]>, + + // + // VPADD, etc. + InstrItinData<IIC_VPBIND, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 1, 1]>, + // + // Double-register FP VMUL + InstrItinData<IIC_VFMULD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 2, 1]>, // // Quad-register FP Binary // Result written in N5, but that is relative to the last cycle of multicycle, // so we use 6 for those cases // FIXME: We're using this itin for many instructions and [2, 2] here is too // optimistic. - InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_DRegsN], 0, Required>, - // Extra latency cycles since wbck is 8 cycles + InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 2, 2]>, + // + // Quad-register FP VMUL + InstrItinData<IIC_VFMULQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [6, 2, 1]>, // // Double-register FP Multiple-Accumulate - InstrItinData<IIC_VMACD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, // // Quad-register FP Multiple-Accumulate // Result written in N9, but that is relative to the last cycle of multicycle, // so we use 10 for those cases - InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>, + InstrStage<4, [A9_NPipe]>], + [8, 4, 2, 1]>, // // Double-register Reciprical Step - InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_DRegsN], 0, Required>, - // Extra latency cycles since wbck is 7 cycles - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, + InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 10 cycles + InstrStage<11, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [9, 2, 2]>, // // Quad-register Reciprical Step - InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_DRegsN], 0, Required>, - // Extra latency cycles since wbck is 9 cycles - InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<4, [A9_NPipe]>], [8, 2, 2]>, + InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 11 cycles + InstrStage<12, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [10, 2, 2]>, // // Double-register Permute - InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 2, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 2, 1, 1]>, // // Quad-register Permute // Result written in N2, but that is relative to the last cycle of multicycle, // so we use 3 for those cases - InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 3, 1, 1]>, + InstrStage<2, [A9_NPipe]>], + [3, 3, 1, 1]>, // // Quad-register Permute (3 cycle issue) // Result written in N2, but that is relative to the last cycle of multicycle, // so we use 4 for those cases - InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<3, [A9_LSPipe]>], [4, 4, 1, 1]>, + InstrStage<3, [A9_NPipe]>], + [4, 4, 1, 1]>, // // Double-register VEXT - InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_DRegsN], 0, Required>, - // Extra latency cycles since wbck is 7 cycles + InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 1, 1]>, // // Quad-register VEXT - InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_DRegsN], 0, Required>, - // Extra latency cycles since wbck is 9 cycles + InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 1, 1]>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 2]>, // // VTB - InstrItinData<IIC_VTB1, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VTB1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 2, 1]>, - InstrItinData<IIC_VTB2, [InstrStage<2, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_NPipe]>], + [3, 2, 1]>, + InstrItinData<IIC_VTB2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 2, 2, 1]>, - InstrItinData<IIC_VTB3, [InstrStage<2, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_NPipe]>], + [3, 2, 2, 1]>, + InstrItinData<IIC_VTB3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 1]>, - InstrItinData<IIC_VTB4, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_NPipe]>], + [4, 2, 2, 3, 1]>, + InstrItinData<IIC_VTB4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 3, 1]>, + InstrStage<3, [A9_NPipe]>], + [4, 2, 2, 3, 3, 1]>, // // VTBX - InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 1, 2, 1]>, - InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 2, 1]>, + InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 1, 2, 2, 1]>, - InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 2, 2, 1]>, + InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 1]>, - InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_NPipe]>], + [4, 1, 2, 2, 3, 1]>, + InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> + InstrStage<2, [A9_NPipe]>], + [4, 1, 2, 2, 3, 3, 1]> ]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td index 08b560c..c1880a7 100644 --- a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td @@ -19,7 +19,7 @@ def V6_Pipe : FuncUnit; // pipeline // Scheduling information derived from "ARM1176JZF-S Technical Reference Manual" // def ARMV6Itineraries : ProcessorItineraries< - [V6_Pipe], [ + [V6_Pipe], [], [ // // No operand cycles InstrItinData<IIC_iALUx , [InstrStage<1, [V6_Pipe]>]>, @@ -30,10 +30,20 @@ def ARMV6Itineraries : ProcessorItineraries< InstrItinData<IIC_iALUsi , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, InstrItinData<IIC_iALUsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>, // + // Bitwise Instructions that produce a result + InstrItinData<IIC_iBITi , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iBITr , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + InstrItinData<IIC_iBITsi , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iBITsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>, + // // Unary Instructions that produce a result InstrItinData<IIC_iUNAr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, InstrItinData<IIC_iUNAsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, - InstrItinData<IIC_iUNAsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + // + // Zero and sign extension instructions + InstrItinData<IIC_iEXTr , [InstrStage<1, [V6_Pipe]>], [1, 1]>, + InstrItinData<IIC_iEXTAr , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iEXTAsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>, // // Compare instructions InstrItinData<IIC_iCMPi , [InstrStage<1, [V6_Pipe]>], [2]>, @@ -41,17 +51,39 @@ def ARMV6Itineraries : ProcessorItineraries< InstrItinData<IIC_iCMPsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, InstrItinData<IIC_iCMPsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, // + // Test instructions + InstrItinData<IIC_iTSTi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iTSTr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iTSTsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iTSTsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + // // Move instructions, unconditional InstrItinData<IIC_iMOVi , [InstrStage<1, [V6_Pipe]>], [2]>, InstrItinData<IIC_iMOVr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, InstrItinData<IIC_iMOVsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, InstrItinData<IIC_iMOVsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + InstrItinData<IIC_iMOVix2 , [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [3]>, + InstrItinData<IIC_iMOVix2ld , [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [5]>, // // Move instructions, conditional InstrItinData<IIC_iCMOVi , [InstrStage<1, [V6_Pipe]>], [3]>, InstrItinData<IIC_iCMOVr , [InstrStage<1, [V6_Pipe]>], [3, 2]>, InstrItinData<IIC_iCMOVsi , [InstrStage<1, [V6_Pipe]>], [3, 1]>, InstrItinData<IIC_iCMOVsr , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + InstrItinData<IIC_iCMOVix2 , [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [4]>, + // + // MVN instructions + InstrItinData<IIC_iMVNi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iMVNr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iMVNsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iMVNsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, // Integer multiply pipeline // @@ -65,50 +97,90 @@ def ARMV6Itineraries : ProcessorItineraries< // Integer load pipeline // // Immediate offset - InstrItinData<IIC_iLoadi , [InstrStage<1, [V6_Pipe]>], [4, 1]>, + InstrItinData<IIC_iLoad_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>, + InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [V6_Pipe]>], [4, 1]>, + InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>, // // Register offset - InstrItinData<IIC_iLoadr , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, // // Scaled register offset, issues over 2 cycles - InstrItinData<IIC_iLoadsi , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>, + InstrItinData<IIC_iLoad_si , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>, + InstrItinData<IIC_iLoad_bh_si, [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>, // // Immediate offset with update - InstrItinData<IIC_iLoadiu , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + InstrItinData<IIC_iLoad_iu , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + InstrItinData<IIC_iLoad_bh_iu, [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, // // Register offset with update - InstrItinData<IIC_iLoadru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, + InstrItinData<IIC_iLoad_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, + InstrItinData<IIC_iLoad_bh_ru, [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, + InstrItinData<IIC_iLoad_d_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, // // Scaled register offset with update, issues over 2 cycles - InstrItinData<IIC_iLoadsiu , [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>, + InstrItinData<IIC_iLoad_siu, [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>, + InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>, + + // + // Load multiple, def is the 5th operand. + InstrItinData<IIC_iLoad_m , [InstrStage<3, [V6_Pipe]>], [1, 1, 1, 1, 4]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData<IIC_iLoad_mu , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 1, 4]>, + // + // Load multiple plus branch + InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [1, 2, 1, 1, 4]>, + + // + // iLoadi + iALUr for t2LDRpci_pic. + InstrItinData<IIC_iLoadiALU, [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [3, 1]>, // - // Load multiple - InstrItinData<IIC_iLoadm , [InstrStage<3, [V6_Pipe]>]>, + // Pop, def is the 3rd operand. + InstrItinData<IIC_iPop , [InstrStage<3, [V6_Pipe]>], [1, 1, 4]>, + // + // Pop + branch, def is the 3rd operand. + InstrItinData<IIC_iPop_Br, [InstrStage<3, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [1, 2, 4]>, // Integer store pipeline // // Immediate offset - InstrItinData<IIC_iStorei , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iStore_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iStore_bh_i, [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iStore_d_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>, // // Register offset - InstrItinData<IIC_iStorer , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, - + InstrItinData<IIC_iStore_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, + InstrItinData<IIC_iStore_bh_r, [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, + InstrItinData<IIC_iStore_d_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, // // Scaled register offset, issues over 2 cycles - InstrItinData<IIC_iStoresi , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iStore_si , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iStore_bh_si, [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>, // // Immediate offset with update - InstrItinData<IIC_iStoreiu , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iStore_iu , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iStore_bh_iu, [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, // // Register offset with update - InstrItinData<IIC_iStoreru , [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, + InstrItinData<IIC_iStore_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, + InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, + InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, // // Scaled register offset with update, issues over 2 cycles - InstrItinData<IIC_iStoresiu, [InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>, + InstrItinData<IIC_iStore_siu, [InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>, + InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>, // // Store multiple - InstrItinData<IIC_iStorem , [InstrStage<3, [V6_Pipe]>]>, + InstrItinData<IIC_iStore_m , [InstrStage<3, [V6_Pipe]>]>, + // + // Store multiple + update + InstrItinData<IIC_iStore_mu , [InstrStage<3, [V6_Pipe]>], [2]>, // Branch // @@ -183,6 +255,18 @@ def ARMV6Itineraries : ProcessorItineraries< // Double-precision FP SQRT InstrItinData<IIC_fpSQRT64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>, // + // Integer to Single-precision Move + InstrItinData<IIC_fpMOVIS, [InstrStage<1, [V6_Pipe]>], [10, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_fpMOVID, [InstrStage<1, [V6_Pipe]>], [10, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_fpMOVSI, [InstrStage<1, [V6_Pipe]>], [10, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_fpMOVDI, [InstrStage<1, [V6_Pipe]>], [10, 10, 1]>, + // // Single-precision FP Load InstrItinData<IIC_fpLoad32 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>, // @@ -190,7 +274,10 @@ def ARMV6Itineraries : ProcessorItineraries< InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>, // // FP Load Multiple - InstrItinData<IIC_fpLoadm , [InstrStage<3, [V6_Pipe]>]>, + InstrItinData<IIC_fpLoad_m , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>, + // + // FP Load Multiple + update + InstrItinData<IIC_fpLoad_mu, [InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>, // // Single-precision FP Store InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, @@ -200,5 +287,8 @@ def ARMV6Itineraries : ProcessorItineraries< InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, // // FP Store Multiple - InstrItinData<IIC_fpStorem , [InstrStage<3, [V6_Pipe]>]> + InstrItinData<IIC_fpStore_m, [InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>, + // + // FP Store Multiple + update + InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]> ]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index a289407..2b9202b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -29,10 +29,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, - const Value *DstSV, - uint64_t DstSVOff, - const Value *SrcSV, - uint64_t SrcSVOff) const { + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const { // Do repeated 4-byte loads and stores. To be improved. // This requires 4-byte alignment. if ((Align & 3) != 0) @@ -66,7 +64,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, Loads[i] = DAG.getLoad(VT, dl, Chain, DAG.getNode(ISD::ADD, dl, MVT::i32, Src, DAG.getConstant(SrcOff, MVT::i32)), - SrcSV, SrcSVOff + SrcOff, isVolatile, false, 0); + SrcPtrInfo.getWithOffset(SrcOff), isVolatile, + false, 0); TFOps[i] = Loads[i].getValue(1); SrcOff += VTSize; } @@ -77,7 +76,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, TFOps[i] = DAG.getStore(Chain, dl, Loads[i], DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, DAG.getConstant(DstOff, MVT::i32)), - DstSV, DstSVOff + DstOff, isVolatile, false, 0); + DstPtrInfo.getWithOffset(DstOff), + isVolatile, false, 0); DstOff += VTSize; } Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); @@ -103,7 +103,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, Loads[i] = DAG.getLoad(VT, dl, Chain, DAG.getNode(ISD::ADD, dl, MVT::i32, Src, DAG.getConstant(SrcOff, MVT::i32)), - SrcSV, SrcSVOff + SrcOff, false, false, 0); + SrcPtrInfo.getWithOffset(SrcOff), false, false, 0); TFOps[i] = Loads[i].getValue(1); ++i; SrcOff += VTSize; @@ -125,7 +125,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, TFOps[i] = DAG.getStore(Chain, dl, Loads[i], DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, DAG.getConstant(DstOff, MVT::i32)), - DstSV, DstSVOff + DstOff, false, false, 0); + DstPtrInfo.getWithOffset(DstOff), false, false, 0); ++i; DstOff += VTSize; BytesLeft -= VTSize; diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h index d7d00c2..7533690 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -33,10 +33,8 @@ public: SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, - const Value *DstSV, - uint64_t DstSVOff, - const Value *SrcSV, - uint64_t SrcSVOff) const; + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const; }; } diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp index cb539f4..0bd740c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -13,6 +13,7 @@ #include "ARMSubtarget.h" #include "ARMGenSubtarget.inc" +#include "ARMBaseRegisterInfo.h" #include "llvm/GlobalValue.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" @@ -24,45 +25,52 @@ ReserveR9("arm-reserve-r9", cl::Hidden, cl::desc("Reserve R9, making it unavailable as GPR")); static cl::opt<bool> -UseMOVT("arm-use-movt", - cl::init(true), cl::Hidden); +DarwinUseMOVT("arm-darwin-use-movt", cl::init(true), cl::Hidden); + +static cl::opt<bool> +StrictAlign("arm-strict-align", cl::Hidden, + cl::desc("Disallow all unaligned memory accesses")); ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, bool isT) : ARMArchVersion(V4) + , ARMProcFamily(Others) , ARMFPUType(None) , UseNEONForSinglePrecisionFP(false) - , SlowVMLx(false) + , SlowFPVMLx(false) , SlowFPBrcc(false) , IsThumb(isT) , ThumbMode(Thumb1) , NoARM(false) , PostRAScheduler(false) , IsR9Reserved(ReserveR9) - , UseMovt(UseMOVT) + , UseMovt(false) , HasFP16(false) + , HasD16(false) , HasHardwareDivide(false) , HasT2ExtractPack(false) , HasDataBarrier(false) , Pref32BitThumb(false) + , HasMPExtension(false) , FPOnlySP(false) + , AllowsUnalignedMem(false) , stackAlignment(4) , CPUString("generic") - , TargetType(isELF) // Default to ELF unless otherwise specified. + , TargetTriple(TT) , TargetABI(ARM_ABI_APCS) { - // default to soft float ABI + // Default to soft float ABI if (FloatABIType == FloatABI::Default) FloatABIType = FloatABI::Soft; // Determine default and user specified characteristics - // Parse features string. - CPUString = ParseSubtargetFeatures(FS, CPUString); - // When no arch is specified either by CPU or by attributes, make the default // ARMv4T. - if (CPUString == "generic" && (FS.empty() || FS == "generic")) + const char *ARMArchFeature = ""; + if (CPUString == "generic" && (FS.empty() || FS == "generic")) { ARMArchVersion = V4T; + ARMArchFeature = ",+v4t"; + } // Set the boolean corresponding to the current target triple, or the default // if one cannot be determined, to true. @@ -80,47 +88,78 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, unsigned SubVer = TT[Idx]; if (SubVer >= '7' && SubVer <= '9') { ARMArchVersion = V7A; - if (Len >= Idx+2 && TT[Idx+1] == 'm') + ARMArchFeature = ",+v7a"; + if (Len >= Idx+2 && TT[Idx+1] == 'm') { ARMArchVersion = V7M; + ARMArchFeature = ",+v7m"; + } } else if (SubVer == '6') { ARMArchVersion = V6; - if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2') + ARMArchFeature = ",+v6"; + if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2') { ARMArchVersion = V6T2; + ARMArchFeature = ",+v6t2"; + } } else if (SubVer == '5') { ARMArchVersion = V5T; - if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e') + ARMArchFeature = ",+v5t"; + if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e') { ARMArchVersion = V5TE; + ARMArchFeature = ",+v5te"; + } } else if (SubVer == '4') { - if (Len >= Idx+2 && TT[Idx+1] == 't') + if (Len >= Idx+2 && TT[Idx+1] == 't') { ARMArchVersion = V4T; - else + ARMArchFeature = ",+v4t"; + } else { ARMArchVersion = V4; + ARMArchFeature = ""; + } } } + if (TT.find("eabi") != std::string::npos) + TargetABI = ARM_ABI_AAPCS; + + // Parse features string. If the first entry in FS (the CPU) is missing, + // insert the architecture feature derived from the target triple. This is + // important for setting features that are implied based on the architecture + // version. + std::string FSWithArch; + if (FS.empty()) + FSWithArch = std::string(ARMArchFeature); + else if (FS.find(',') == 0) + FSWithArch = std::string(ARMArchFeature) + FS; + else + FSWithArch = FS; + CPUString = ParseSubtargetFeatures(FSWithArch, CPUString); + + // After parsing Itineraries, set ItinData.IssueWidth. + computeIssueWidth(); + // Thumb2 implies at least V6T2. if (ARMArchVersion >= V6T2) ThumbMode = Thumb2; else if (ThumbMode >= Thumb2) ARMArchVersion = V6T2; - if (Len >= 10) { - if (TT.find("-darwin") != std::string::npos) - // arm-darwin - TargetType = isDarwin; - } - - if (TT.find("eabi") != std::string::npos) - TargetABI = ARM_ABI_AAPCS; - if (isAAPCS_ABI()) stackAlignment = 8; - if (isTargetDarwin()) + if (!isTargetDarwin()) + UseMovt = hasV6T2Ops(); + else { IsR9Reserved = ReserveR9 | (ARMArchVersion < V6); + UseMovt = DarwinUseMOVT && hasV6T2Ops(); + } if (!isThumb() || hasThumb2()) PostRAScheduler = true; + + // v6+ may or may not support unaligned mem access depending on the system + // configuration. + if (!StrictAlign && hasV6Ops() && isTargetDarwin()) + AllowsUnalignedMem = true; } /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. @@ -163,7 +202,7 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, // through a stub. if (!isDecl && !GV->isWeakForLinker()) return false; - + // Unless we have a symbol with hidden visibility, we have to go through a // normal $non_lazy_ptr stub because this symbol might be resolved late. if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. @@ -174,6 +213,34 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, return false; } +unsigned ARMSubtarget::getMispredictionPenalty() const { + // If we have a reasonable estimate of the pipeline depth, then we can + // estimate the penalty of a misprediction based on that. + if (isCortexA8()) + return 13; + else if (isCortexA9()) + return 8; + + // Otherwise, just return a sensible default. + return 10; +} + +void ARMSubtarget::computeIssueWidth() { + unsigned allStage1Units = 0; + for (const InstrItinerary *itin = InstrItins.Itineraries; + itin->FirstStage != ~0U; ++itin) { + const InstrStage *IS = InstrItins.Stages + itin->FirstStage; + allStage1Units |= IS->getUnits(); + } + InstrItins.IssueWidth = 0; + while (allStage1Units) { + ++InstrItins.IssueWidth; + // clear the lowest bit + allStage1Units ^= allStage1Units & ~(allStage1Units - 1); + } + assert(InstrItins.IssueWidth <= 2 && "itinerary bug, too many stage 1 units"); +} + bool ARMSubtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtarget::AntiDepBreakMode& Mode, diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h index 67e5803..76c1c3f 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h @@ -17,7 +17,7 @@ #include "llvm/Target/TargetInstrItineraries.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtarget.h" -#include "ARMBaseRegisterInfo.h" +#include "llvm/ADT/Triple.h" #include <string> namespace llvm { @@ -29,6 +29,10 @@ protected: V4, V4T, V5T, V5TE, V6, V6M, V6T2, V7A, V7M }; + enum ARMProcFamilyEnum { + Others, CortexA8, CortexA9 + }; + enum ARMFPEnum { None, VFPv2, VFPv3, NEON }; @@ -42,6 +46,9 @@ protected: /// V6, V6T2, V7A, V7M. ARMArchEnum ARMArchVersion; + /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others. + ARMProcFamilyEnum ARMProcFamily; + /// ARMFPUType - Floating Point Unit type. ARMFPEnum ARMFPUType; @@ -50,9 +57,9 @@ protected: /// determine if NEON should actually be used. bool UseNEONForSinglePrecisionFP; - /// SlowVMLx - If the VFP2 instructions are available, indicates whether - /// the VML[AS] instructions are slow (if so, don't use them). - bool SlowVMLx; + /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates + /// whether the FP VML[AS] instructions are slow (if so, don't use them). + bool SlowFPVMLx; /// SlowFPBrcc - True if floating point compare + branch is slow. bool SlowFPBrcc; @@ -80,6 +87,10 @@ protected: /// only so far) bool HasFP16; + /// HasD16 - True if subtarget is limited to 16 double precision + /// FP registers for VFPv3. + bool HasD16; + /// HasHardwareDivide - True if subtarget supports [su]div bool HasHardwareDivide; @@ -95,10 +106,19 @@ protected: /// over 16-bit ones. bool Pref32BitThumb; + /// HasMPExtension - True if the subtarget supports Multiprocessing + /// extension (ARMv7 only). + bool HasMPExtension; + /// FPOnlySP - If true, the floating point unit only supports single /// precision. bool FPOnlySP; + /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory + /// accesses for some types. For details, see + /// ARMTargetLowering::allowsUnalignedMemoryAccesses(). + bool AllowsUnalignedMem; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -106,6 +126,9 @@ protected: /// CPUString - String name of used CPU. std::string CPUString; + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + /// Selected instruction itineraries (one entry per itinerary class.) InstrItineraryData InstrItins; @@ -136,6 +159,8 @@ protected: std::string ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); + void computeIssueWidth(); + bool hasV4TOps() const { return ARMArchVersion >= V4T; } bool hasV5TOps() const { return ARMArchVersion >= V5T; } bool hasV5TEOps() const { return ARMArchVersion >= V5TE; } @@ -143,6 +168,9 @@ protected: bool hasV6T2Ops() const { return ARMArchVersion >= V6T2; } bool hasV7Ops() const { return ARMArchVersion >= V7A; } + bool isCortexA8() const { return ARMProcFamily == CortexA8; } + bool isCortexA9() const { return ARMProcFamily == CortexA9; } + bool hasARMOps() const { return !NoARM; } bool hasVFP2() const { return ARMFPUType >= VFPv2; } @@ -153,15 +181,17 @@ protected: bool hasDivide() const { return HasHardwareDivide; } bool hasT2ExtractPack() const { return HasT2ExtractPack; } bool hasDataBarrier() const { return HasDataBarrier; } - bool useVMLx() const {return hasVFP2() && !SlowVMLx; } + bool useFPVMLx() const { return !SlowFPVMLx; } bool isFPBrccSlow() const { return SlowFPBrcc; } bool isFPOnlySP() const { return FPOnlySP; } bool prefers32BitThumb() const { return Pref32BitThumb; } + bool hasMPExtension() const { return HasMPExtension; } bool hasFP16() const { return HasFP16; } + bool hasD16() const { return HasD16; } - bool isTargetDarwin() const { return TargetType == isDarwin; } - bool isTargetELF() const { return TargetType == isELF; } + bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } + bool isTargetELF() const { return !isTargetDarwin(); } bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; } bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; } @@ -175,8 +205,12 @@ protected: bool useMovt() const { return UseMovt && hasV6T2Ops(); } + bool allowsUnalignedMem() const { return AllowsUnalignedMem; } + const std::string & getCPUString() const { return CPUString; } + unsigned getMispredictionPenalty() const; + /// enablePostRAScheduler - True at 'More' optimization. bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, TargetSubtarget::AntiDepBreakMode& Mode, diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 30ff827..0ee773b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -12,15 +12,18 @@ #include "ARMTargetMachine.h" #include "ARMMCAsmInfo.h" -#include "ARMFrameInfo.h" +#include "ARMFrameLowering.h" #include "ARM.h" #include "llvm/PassManager.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegistry.h" using namespace llvm; +static cl::opt<bool>ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden); + static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); switch (TheTriple.getOS()) { @@ -31,6 +34,26 @@ static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { } } +// This is duplicated code. Refactor this. +static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, + MCContext &Ctx, TargetAsmBackend &TAB, + raw_ostream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll, + bool NoExecStack) { + switch (Triple(TT).getOS()) { + case Triple::Darwin: + return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll); + case Triple::MinGW32: + case Triple::Cygwin: + case Triple::Win32: + llvm_unreachable("ARM does not support Windows COFF format"); + return NULL; + default: + return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); + } +} + extern "C" void LLVMInitializeARMTarget() { // Register the target. RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget); @@ -39,6 +62,19 @@ extern "C" void LLVMInitializeARMTarget() { // Register the target asm info. RegisterAsmInfoFn A(TheARMTarget, createMCAsmInfo); RegisterAsmInfoFn B(TheThumbTarget, createMCAsmInfo); + + // Register the MC Code Emitter + TargetRegistry::RegisterCodeEmitter(TheARMTarget, createARMMCCodeEmitter); + TargetRegistry::RegisterCodeEmitter(TheThumbTarget, createARMMCCodeEmitter); + + // Register the asm backend. + TargetRegistry::RegisterAsmBackend(TheARMTarget, createARMAsmBackend); + TargetRegistry::RegisterAsmBackend(TheThumbTarget, createARMAsmBackend); + + // Register the object streamer. + TargetRegistry::RegisterObjectStreamer(TheARMTarget, createMCStreamer); + TargetRegistry::RegisterObjectStreamer(TheThumbTarget, createMCStreamer); + } /// TargetMachine ctor - Create an ARM architecture model. @@ -49,9 +85,9 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, bool isThumb) : LLVMTargetMachine(T, TT), Subtarget(TT, FS, isThumb), - FrameInfo(Subtarget), JITInfo(), - InstrItins(Subtarget.getInstrItineraryData()) { + InstrItins(Subtarget.getInstrItineraryData()) +{ DefRelocModel = getRelocationModel(); } @@ -59,12 +95,14 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT, const std::string &FS) : ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget), DataLayout(Subtarget.isAPCS_ABI() ? - std::string("e-p:32:32-f64:32:32-i64:32:32-" + std::string("e-p:32:32-f64:32:64-i64:32:64-" "v128:32:128-v64:32:64-n32") : std::string("e-p:32:32-f64:64:64-i64:64:64-" "v128:64:128-v64:64:64-n32")), + ELFWriterInfo(*this), TLInfo(*this), - TSInfo(*this) { + TSInfo(*this), + FrameLowering(Subtarget) { if (!Subtarget.hasARMOps()) report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " "support ARM mode execution!"); @@ -77,14 +115,18 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT, ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget)) : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), DataLayout(Subtarget.isAPCS_ABI() ? - std::string("e-p:32:32-f64:32:32-i64:32:32-" + std::string("e-p:32:32-f64:32:64-i64:32:64-" "i16:16:32-i8:8:32-i1:8:32-" "v128:32:128-v64:32:64-a:0:32-n32") : std::string("e-p:32:32-f64:64:64-i64:64:64-" "i16:16:32-i8:8:32-i1:8:32-" "v128:64:128-v64:64:64-a:0:32-n32")), + ELFWriterInfo(*this), TLInfo(*this), - TSInfo(*this) { + TSInfo(*this), + FrameLowering(Subtarget.hasThumb2() + ? new ARMFrameLowering(Subtarget) + : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) { } // Pass Pipeline Configuration @@ -104,12 +146,12 @@ bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM, bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { - if (Subtarget.hasNEON()) - PM.add(createNEONPreAllocPass()); - // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass(true)); + if (ExpandMLx && + OptLevel != CodeGenOpt::None && Subtarget.hasVFP2()) + PM.add(createMLxExpansionPass()); return true; } diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h index 17e5425..e0aa149 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -14,16 +14,19 @@ #ifndef ARMTARGETMACHINE_H #define ARMTARGETMACHINE_H -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetData.h" #include "ARMInstrInfo.h" -#include "ARMFrameInfo.h" +#include "ARMELFWriterInfo.h" +#include "ARMFrameLowering.h" #include "ARMJITInfo.h" #include "ARMSubtarget.h" #include "ARMISelLowering.h" #include "ARMSelectionDAGInfo.h" #include "Thumb1InstrInfo.h" +#include "Thumb1FrameLowering.h" #include "Thumb2InstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/ADT/OwningPtr.h" namespace llvm { @@ -31,9 +34,7 @@ namespace llvm { class ARMBaseTargetMachine : public LLVMTargetMachine { protected: ARMSubtarget Subtarget; - private: - ARMFrameInfo FrameInfo; ARMJITInfo JITInfo; InstrItineraryData InstrItins; Reloc::Model DefRelocModel; // Reloc model before it's overridden. @@ -42,11 +43,10 @@ public: ARMBaseTargetMachine(const Target &T, const std::string &TT, const std::string &FS, bool isThumb); - virtual const ARMFrameInfo *getFrameInfo() const { return &FrameInfo; } virtual ARMJITInfo *getJITInfo() { return &JITInfo; } virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } - virtual const InstrItineraryData getInstrItineraryData() const { - return InstrItins; + virtual const InstrItineraryData *getInstrItineraryData() const { + return &InstrItins; } // Pass Pipeline Configuration @@ -64,9 +64,11 @@ public: class ARMTargetMachine : public ARMBaseTargetMachine { ARMInstrInfo InstrInfo; const TargetData DataLayout; // Calculates type size & alignment + ARMELFWriterInfo ELFWriterInfo; ARMTargetLowering TLInfo; ARMSelectionDAGInfo TSInfo; -public: + ARMFrameLowering FrameLowering; + public: ARMTargetMachine(const Target &T, const std::string &TT, const std::string &FS); @@ -81,9 +83,15 @@ public: virtual const ARMSelectionDAGInfo* getSelectionDAGInfo() const { return &TSInfo; } + virtual const ARMFrameLowering *getFrameLowering() const { + return &FrameLowering; + } virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; } virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const ARMELFWriterInfo *getELFWriterInfo() const { + return Subtarget.isTargetELF() ? &ELFWriterInfo : 0; + } }; /// ThumbTargetMachine - Thumb target machine. @@ -94,8 +102,11 @@ class ThumbTargetMachine : public ARMBaseTargetMachine { // Either Thumb1InstrInfo or Thumb2InstrInfo. OwningPtr<ARMBaseInstrInfo> InstrInfo; const TargetData DataLayout; // Calculates type size & alignment + ARMELFWriterInfo ELFWriterInfo; ARMTargetLowering TLInfo; ARMSelectionDAGInfo TSInfo; + // Either Thumb1FrameLowering or ARMFrameLowering. + OwningPtr<ARMFrameLowering> FrameLowering; public: ThumbTargetMachine(const Target &T, const std::string &TT, const std::string &FS); @@ -117,7 +128,14 @@ public: virtual const ARMBaseInstrInfo *getInstrInfo() const { return InstrInfo.get(); } + /// returns either Thumb1FrameLowering or ARMFrameLowering + virtual const ARMFrameLowering *getFrameLowering() const { + return FrameLowering.get(); + } virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const ARMELFWriterInfo *getELFWriterInfo() const { + return Subtarget.isTargetELF() ? &ELFWriterInfo : 0; + } }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp index 091a3b3..7535da5 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -12,6 +12,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/Support/Dwarf.h" +#include "llvm/Support/ELF.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; using namespace dwarf; @@ -26,14 +27,20 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, if (TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI()) { StaticCtorSection = - getContext().getELFSection(".init_array", MCSectionELF::SHT_INIT_ARRAY, - MCSectionELF::SHF_WRITE | - MCSectionELF::SHF_ALLOC, + getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, SectionKind::getDataRel()); StaticDtorSection = - getContext().getELFSection(".fini_array", MCSectionELF::SHT_FINI_ARRAY, - MCSectionELF::SHF_WRITE | - MCSectionELF::SHF_ALLOC, + getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, SectionKind::getDataRel()); } + + AttributesSection = + getContext().getELFSection(".ARM.attributes", + ELF::SHT_ARM_ATTRIBUTES, + 0, + SectionKind::getMetadata()); } diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h index 097fc2c..c6a7261 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h @@ -18,10 +18,19 @@ class MCContext; class TargetMachine; class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF { +protected: + const MCSection *AttributesSection; public: - ARMElfTargetObjectFile() : TargetLoweringObjectFileELF() {} + ARMElfTargetObjectFile() : + TargetLoweringObjectFileELF(), + AttributesSection(NULL) + {} virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); + + virtual const MCSection *getAttributesSection() const { + return AttributesSection; + } }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp index f859d1b..2428ce1 100644 --- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp @@ -10,10 +10,6 @@ #include "ARM.h" #include "ARMTargetMachine.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" - #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" @@ -22,119 +18,135 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegistry.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" + #include <string> #include <map> using namespace llvm; namespace { - - class ARMBaseAsmLexer : public TargetAsmLexer { - const MCAsmInfo &AsmInfo; - - const AsmToken &lexDefinite() { - return getLexer()->Lex(); - } - - AsmToken LexTokenUAL(); - protected: - typedef std::map <std::string, unsigned> rmap_ty; - - rmap_ty RegisterMap; - - void InitRegisterMap(const TargetRegisterInfo *info) { - unsigned numRegs = info->getNumRegs(); - - for (unsigned i = 0; i < numRegs; ++i) { - const char *regName = info->getName(i); - if (regName) - RegisterMap[regName] = i; - } - } - - unsigned MatchRegisterName(StringRef Name) { - rmap_ty::iterator iter = RegisterMap.find(Name.str()); - if (iter != RegisterMap.end()) - return iter->second; - else - return 0; - } - - AsmToken LexToken() { - if (!Lexer) { - SetError(SMLoc(), "No MCAsmLexer installed"); - return AsmToken(AsmToken::Error, "", 0); - } - - switch (AsmInfo.getAssemblerDialect()) { - default: - SetError(SMLoc(), "Unhandled dialect"); - return AsmToken(AsmToken::Error, "", 0); - case 0: - return LexTokenUAL(); - } - } - public: - ARMBaseAsmLexer(const Target &T, const MCAsmInfo &MAI) - : TargetAsmLexer(T), AsmInfo(MAI) { + +class ARMBaseAsmLexer : public TargetAsmLexer { + const MCAsmInfo &AsmInfo; + + const AsmToken &lexDefinite() { + return getLexer()->Lex(); + } + + AsmToken LexTokenUAL(); +protected: + typedef std::map <std::string, unsigned> rmap_ty; + + rmap_ty RegisterMap; + + void InitRegisterMap(const TargetRegisterInfo *info) { + unsigned numRegs = info->getNumRegs(); + + for (unsigned i = 0; i < numRegs; ++i) { + const char *regName = info->getName(i); + if (regName) + RegisterMap[regName] = i; } - }; - - class ARMAsmLexer : public ARMBaseAsmLexer { - public: - ARMAsmLexer(const Target &T, const MCAsmInfo &MAI) - : ARMBaseAsmLexer(T, MAI) { - std::string tripleString("arm-unknown-unknown"); - std::string featureString; - OwningPtr<const TargetMachine> - targetMachine(T.createTargetMachine(tripleString, featureString)); - InitRegisterMap(targetMachine->getRegisterInfo()); + } + + unsigned MatchRegisterName(StringRef Name) { + rmap_ty::iterator iter = RegisterMap.find(Name.str()); + if (iter != RegisterMap.end()) + return iter->second; + else + return 0; + } + + AsmToken LexToken() { + if (!Lexer) { + SetError(SMLoc(), "No MCAsmLexer installed"); + return AsmToken(AsmToken::Error, "", 0); } - }; - - class ThumbAsmLexer : public ARMBaseAsmLexer { - public: - ThumbAsmLexer(const Target &T, const MCAsmInfo &MAI) - : ARMBaseAsmLexer(T, MAI) { - std::string tripleString("thumb-unknown-unknown"); - std::string featureString; - OwningPtr<const TargetMachine> - targetMachine(T.createTargetMachine(tripleString, featureString)); - InitRegisterMap(targetMachine->getRegisterInfo()); + + switch (AsmInfo.getAssemblerDialect()) { + default: + SetError(SMLoc(), "Unhandled dialect"); + return AsmToken(AsmToken::Error, "", 0); + case 0: + return LexTokenUAL(); } - }; -} + } +public: + ARMBaseAsmLexer(const Target &T, const MCAsmInfo &MAI) + : TargetAsmLexer(T), AsmInfo(MAI) { + } +}; + +class ARMAsmLexer : public ARMBaseAsmLexer { +public: + ARMAsmLexer(const Target &T, const MCAsmInfo &MAI) + : ARMBaseAsmLexer(T, MAI) { + std::string tripleString("arm-unknown-unknown"); + std::string featureString; + OwningPtr<const TargetMachine> + targetMachine(T.createTargetMachine(tripleString, featureString)); + InitRegisterMap(targetMachine->getRegisterInfo()); + } +}; + +class ThumbAsmLexer : public ARMBaseAsmLexer { +public: + ThumbAsmLexer(const Target &T, const MCAsmInfo &MAI) + : ARMBaseAsmLexer(T, MAI) { + std::string tripleString("thumb-unknown-unknown"); + std::string featureString; + OwningPtr<const TargetMachine> + targetMachine(T.createTargetMachine(tripleString, featureString)); + InitRegisterMap(targetMachine->getRegisterInfo()); + } +}; + +} // end anonymous namespace AsmToken ARMBaseAsmLexer::LexTokenUAL() { const AsmToken &lexedToken = lexDefinite(); - + switch (lexedToken.getKind()) { - default: - return AsmToken(lexedToken); + default: break; case AsmToken::Error: SetError(Lexer->getErrLoc(), Lexer->getErr()); - return AsmToken(lexedToken); - case AsmToken::Identifier: - { + break; + case AsmToken::Identifier: { std::string upperCase = lexedToken.getString().str(); std::string lowerCase = LowercaseString(upperCase); StringRef lowerRef(lowerCase); - + unsigned regID = MatchRegisterName(lowerRef); - - if (regID) { + // Check for register aliases. + // r13 -> sp + // r14 -> lr + // r15 -> pc + // ip -> r12 + // FIXME: Some assemblers support lots of others. Do we want them all? + if (!regID) { + regID = StringSwitch<unsigned>(lowerCase) + .Case("r13", ARM::SP) + .Case("r14", ARM::LR) + .Case("r15", ARM::PC) + .Case("ip", ARM::R12) + .Default(0); + } + + if (regID) return AsmToken(AsmToken::Register, lexedToken.getString(), static_cast<int64_t>(regID)); - } else { - return AsmToken(lexedToken); - } } } + + return AsmToken(lexedToken); } extern "C" void LLVMInitializeARMAsmLexer() { RegisterAsmLexer<ARMAsmLexer> X(TheARMTarget); RegisterAsmLexer<ThumbAsmLexer> Y(TheThumbTarget); } - diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 75e2a73..129af20 100644 --- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -8,28 +8,28 @@ //===----------------------------------------------------------------------===// #include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMMCExpr.h" +#include "ARMBaseRegisterInfo.h" #include "ARMSubtarget.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetAsmParser.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" using namespace llvm; -namespace { -struct ARMOperand; - -// The shift types for register controlled shifts in arm memory addressing +/// Shift types used for register controlled shifts in ARM memory addressing. enum ShiftType { Lsl, Lsr, @@ -38,24 +38,30 @@ enum ShiftType { Rrx }; +namespace { + +class ARMOperand; + class ARMAsmParser : public TargetAsmParser { MCAsmParser &Parser; TargetMachine &TM; -private: MCAsmParser &getParser() const { return Parser; } - MCAsmLexer &getLexer() const { return Parser.getLexer(); } void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } - bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } - bool MaybeParseRegister(OwningPtr<ARMOperand> &Op, bool ParseWriteBack); + int TryParseRegister(); + virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); + bool TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &); + bool ParseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &); + bool ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &); + bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic); + bool ParsePrefix(ARMMCExpr::VariantKind &RefKind); + const MCExpr *ApplyPrefixToExpr(const MCExpr *E, + MCSymbolRefExpr::VariantKind Variant); - bool ParseRegisterList(OwningPtr<ARMOperand> &Op); - - bool ParseMemory(OwningPtr<ARMOperand> &Op); bool ParseMemoryOffsetReg(bool &Negative, bool &OffsetRegShifted, @@ -65,70 +71,76 @@ private: bool &OffsetIsReg, int &OffsetRegNum, SMLoc &E); - bool ParseShift(enum ShiftType &St, const MCExpr *&ShiftAmount, SMLoc &E); - - bool ParseOperand(OwningPtr<ARMOperand> &Op); - bool ParseDirectiveWord(unsigned Size, SMLoc L); - bool ParseDirectiveThumb(SMLoc L); - bool ParseDirectiveThumbFunc(SMLoc L); - bool ParseDirectiveCode(SMLoc L); - bool ParseDirectiveSyntax(SMLoc L); - bool MatchInstruction(SMLoc IDLoc, - const SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCInst &Inst) { - if (!MatchInstructionImpl(Operands, Inst)) - return false; - - // FIXME: We should give nicer diagnostics about the exact failure. - Error(IDLoc, "unrecognized instruction"); - - return true; - } + bool MatchAndEmitInstruction(SMLoc IDLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCStreamer &Out); + void GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet, + bool &CanAcceptPredicationCode); /// @name Auto-generated Match Functions /// { - unsigned ComputeAvailableFeatures(const ARMSubtarget *Subtarget) const; - - bool MatchInstructionImpl(const SmallVectorImpl<MCParsedAsmOperand*> - &Operands, - MCInst &Inst); +#define GET_ASSEMBLER_HEADER +#include "ARMGenAsmMatcher.inc" /// } + OperandMatchResultTy tryParseCoprocNumOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy tryParseCoprocRegOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy tryParseMemBarrierOptOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy tryParseProcIFlagsOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy tryParseMSRMaskOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); public: ARMAsmParser(const Target &T, MCAsmParser &_Parser, TargetMachine &_TM) - : TargetAsmParser(T), Parser(_Parser), TM(_TM) {} + : TargetAsmParser(T), Parser(_Parser), TM(_TM) { + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures( + &TM.getSubtarget<ARMSubtarget>())); + } virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands); - virtual bool ParseDirective(AsmToken DirectiveID); }; - +} // end anonymous namespace + +namespace { + /// ARMOperand - Instances of this class represent a parsed ARM machine /// instruction. -struct ARMOperand : public MCParsedAsmOperand { -private: - ARMOperand() {} -public: +class ARMOperand : public MCParsedAsmOperand { enum KindTy { CondCode, + CCOut, + CoprocNum, + CoprocReg, Immediate, + MemBarrierOpt, Memory, + MSRMask, + ProcIFlags, Register, + RegisterList, + DPRRegisterList, + SPRRegisterList, Token } Kind; SMLoc StartLoc, EndLoc; + SmallVector<unsigned, 8> Registers; union { struct { @@ -136,40 +148,54 @@ public: } CC; struct { + ARM_MB::MemBOpt Val; + } MBOpt; + + struct { + unsigned Val; + } Cop; + + struct { + ARM_PROC::IFlags Val; + } IFlags; + + struct { + unsigned Val; + } MMask; + + struct { const char *Data; unsigned Length; } Tok; struct { unsigned RegNum; - bool Writeback; } Reg; struct { const MCExpr *Val; } Imm; - - // This is for all forms of ARM address expressions + + /// Combined record for all forms of ARM address expressions. struct { unsigned BaseRegNum; - unsigned OffsetRegNum; // used when OffsetIsReg is true - const MCExpr *Offset; // used when OffsetIsReg is false - const MCExpr *ShiftAmount; // used when OffsetRegShifted is true - enum ShiftType ShiftType; // used when OffsetRegShifted is true - unsigned - OffsetRegShifted : 1, // only used when OffsetIsReg is true - Preindexed : 1, - Postindexed : 1, - OffsetIsReg : 1, - Negative : 1, // only used when OffsetIsReg is true - Writeback : 1; + union { + unsigned RegNum; ///< Offset register num, when OffsetIsReg. + const MCExpr *Value; ///< Offset value, when !OffsetIsReg. + } Offset; + const MCExpr *ShiftAmount; // used when OffsetRegShifted is true + enum ShiftType ShiftType; // used when OffsetRegShifted is true + unsigned OffsetRegShifted : 1; // only used when OffsetIsReg is true + unsigned Preindexed : 1; + unsigned Postindexed : 1; + unsigned OffsetIsReg : 1; + unsigned Negative : 1; // only used when OffsetIsReg is true + unsigned Writeback : 1; } Mem; - }; - - //ARMOperand(KindTy K, SMLoc S, SMLoc E) - // : Kind(K), StartLoc(S), EndLoc(E) {} - + + ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} +public: ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() { Kind = o.Kind; StartLoc = o.StartLoc; @@ -181,18 +207,36 @@ public: case Token: Tok = o.Tok; break; + case CCOut: case Register: Reg = o.Reg; break; + case RegisterList: + case DPRRegisterList: + case SPRRegisterList: + Registers = o.Registers; + break; + case CoprocNum: + case CoprocReg: + Cop = o.Cop; + break; case Immediate: Imm = o.Imm; break; + case MemBarrierOpt: + MBOpt = o.MBOpt; + break; case Memory: Mem = o.Mem; break; + case MSRMask: + MMask = o.MMask; + break; + case ProcIFlags: + IFlags = o.IFlags; } } - + /// getStartLoc - Get the location of the first token of this operand. SMLoc getStartLoc() const { return StartLoc; } /// getEndLoc - Get the location of the last token of this operand. @@ -203,32 +247,129 @@ public: return CC.Val; } + unsigned getCoproc() const { + assert((Kind == CoprocNum || Kind == CoprocReg) && "Invalid access!"); + return Cop.Val; + } + StringRef getToken() const { assert(Kind == Token && "Invalid access!"); return StringRef(Tok.Data, Tok.Length); } unsigned getReg() const { - assert(Kind == Register && "Invalid access!"); + assert((Kind == Register || Kind == CCOut) && "Invalid access!"); return Reg.RegNum; } + const SmallVectorImpl<unsigned> &getRegList() const { + assert((Kind == RegisterList || Kind == DPRRegisterList || + Kind == SPRRegisterList) && "Invalid access!"); + return Registers; + } + const MCExpr *getImm() const { assert(Kind == Immediate && "Invalid access!"); return Imm.Val; } - bool isCondCode() const { return Kind == CondCode; } + ARM_MB::MemBOpt getMemBarrierOpt() const { + assert(Kind == MemBarrierOpt && "Invalid access!"); + return MBOpt.Val; + } - bool isImm() const { return Kind == Immediate; } + ARM_PROC::IFlags getProcIFlags() const { + assert(Kind == ProcIFlags && "Invalid access!"); + return IFlags.Val; + } + + unsigned getMSRMask() const { + assert(Kind == MSRMask && "Invalid access!"); + return MMask.Val; + } + + /// @name Memory Operand Accessors + /// @{ + + unsigned getMemBaseRegNum() const { + return Mem.BaseRegNum; + } + unsigned getMemOffsetRegNum() const { + assert(Mem.OffsetIsReg && "Invalid access!"); + return Mem.Offset.RegNum; + } + const MCExpr *getMemOffset() const { + assert(!Mem.OffsetIsReg && "Invalid access!"); + return Mem.Offset.Value; + } + unsigned getMemOffsetRegShifted() const { + assert(Mem.OffsetIsReg && "Invalid access!"); + return Mem.OffsetRegShifted; + } + const MCExpr *getMemShiftAmount() const { + assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!"); + return Mem.ShiftAmount; + } + enum ShiftType getMemShiftType() const { + assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!"); + return Mem.ShiftType; + } + bool getMemPreindexed() const { return Mem.Preindexed; } + bool getMemPostindexed() const { return Mem.Postindexed; } + bool getMemOffsetIsReg() const { return Mem.OffsetIsReg; } + bool getMemNegative() const { return Mem.Negative; } + bool getMemWriteback() const { return Mem.Writeback; } + + /// @} + bool isCoprocNum() const { return Kind == CoprocNum; } + bool isCoprocReg() const { return Kind == CoprocReg; } + bool isCondCode() const { return Kind == CondCode; } + bool isCCOut() const { return Kind == CCOut; } + bool isImm() const { return Kind == Immediate; } bool isReg() const { return Kind == Register; } + bool isRegList() const { return Kind == RegisterList; } + bool isDPRRegList() const { return Kind == DPRRegisterList; } + bool isSPRRegList() const { return Kind == SPRRegisterList; } + bool isToken() const { return Kind == Token; } + bool isMemBarrierOpt() const { return Kind == MemBarrierOpt; } + bool isMemory() const { return Kind == Memory; } + bool isMemMode5() const { + if (!isMemory() || getMemOffsetIsReg() || getMemWriteback() || + getMemNegative()) + return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + if (!CE) return false; + + // The offset must be a multiple of 4 in the range 0-1020. + int64_t Value = CE->getValue(); + return ((Value & 0x3) == 0 && Value <= 1020 && Value >= -1020); + } + bool isMemModeRegThumb() const { + if (!isMemory() || !getMemOffsetIsReg() || getMemWriteback()) + return false; + return true; + } + bool isMemModeImmThumb() const { + if (!isMemory() || getMemOffsetIsReg() || getMemWriteback()) + return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + if (!CE) return false; - bool isToken() const {return Kind == Token; } + // The offset must be a multiple of 4 in the range 0-124. + uint64_t Value = CE->getValue(); + return ((Value & 0x3) == 0 && Value <= 124); + } + bool isMSRMask() const { return Kind == MSRMask; } + bool isProcIFlags() const { return Kind == ProcIFlags; } void addExpr(MCInst &Inst, const MCExpr *Expr) const { - // Add as immediates when possible. - if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + // Add as immediates when possible. Null MCExpr = 0. + if (Expr == 0) + Inst.addOperand(MCOperand::CreateImm(0)); + else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) Inst.addOperand(MCOperand::CreateImm(CE->getValue())); else Inst.addOperand(MCOperand::CreateExpr(Expr)); @@ -237,8 +378,23 @@ public: void addCondCodeOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode()))); - // FIXME: What belongs here? - Inst.addOperand(MCOperand::CreateReg(0)); + unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR; + Inst.addOperand(MCOperand::CreateReg(RegNum)); + } + + void addCoprocNumOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getCoproc())); + } + + void addCoprocRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getCoproc())); + } + + void addCCOutOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); } void addRegOperands(MCInst &Inst, unsigned N) const { @@ -246,66 +402,181 @@ public: Inst.addOperand(MCOperand::CreateReg(getReg())); } + void addRegListOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const SmallVectorImpl<unsigned> &RegList = getRegList(); + for (SmallVectorImpl<unsigned>::const_iterator + I = RegList.begin(), E = RegList.end(); I != E; ++I) + Inst.addOperand(MCOperand::CreateReg(*I)); + } + + void addDPRRegListOperands(MCInst &Inst, unsigned N) const { + addRegListOperands(Inst, N); + } + + void addSPRRegListOperands(MCInst &Inst, unsigned N) const { + addRegListOperands(Inst, N); + } + void addImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); addExpr(Inst, getImm()); } + void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt()))); + } + + void addMemMode5Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && isMemMode5() && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum())); + assert(!getMemOffsetIsReg() && "Invalid mode 5 operand"); + + // FIXME: #-0 is encoded differently than #0. Does the parser preserve + // the difference? + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + assert(CE && "Non-constant mode 5 offset operand!"); + + // The MCInst offset operand doesn't include the low two bits (like + // the instruction encoding). + int64_t Offset = CE->getValue() / 4; + if (Offset >= 0) + Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::add, + Offset))); + else + Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::sub, + -Offset))); + } + + void addMemModeRegThumbOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && isMemModeRegThumb() && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum())); + Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum())); + } + + void addMemModeImmThumbOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && isMemModeImmThumb() && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum())); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + assert(CE && "Non-constant mode offset operand!"); + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + } + + void addMSRMaskOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(unsigned(getMSRMask()))); + } + + void addProcIFlagsOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(unsigned(getProcIFlags()))); + } + virtual void dump(raw_ostream &OS) const; - static void CreateCondCode(OwningPtr<ARMOperand> &Op, ARMCC::CondCodes CC, - SMLoc S) { - Op.reset(new ARMOperand); - Op->Kind = CondCode; + static ARMOperand *CreateCondCode(ARMCC::CondCodes CC, SMLoc S) { + ARMOperand *Op = new ARMOperand(CondCode); Op->CC.Val = CC; Op->StartLoc = S; Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateCoprocNum(unsigned CopVal, SMLoc S) { + ARMOperand *Op = new ARMOperand(CoprocNum); + Op->Cop.Val = CopVal; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; } - static void CreateToken(OwningPtr<ARMOperand> &Op, StringRef Str, - SMLoc S) { - Op.reset(new ARMOperand); - Op->Kind = Token; + static ARMOperand *CreateCoprocReg(unsigned CopVal, SMLoc S) { + ARMOperand *Op = new ARMOperand(CoprocReg); + Op->Cop.Val = CopVal; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateCCOut(unsigned RegNum, SMLoc S) { + ARMOperand *Op = new ARMOperand(CCOut); + Op->Reg.RegNum = RegNum; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateToken(StringRef Str, SMLoc S) { + ARMOperand *Op = new ARMOperand(Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; Op->EndLoc = S; + return Op; } - static void CreateReg(OwningPtr<ARMOperand> &Op, unsigned RegNum, - bool Writeback, SMLoc S, SMLoc E) { - Op.reset(new ARMOperand); - Op->Kind = Register; + static ARMOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(Register); Op->Reg.RegNum = RegNum; - Op->Reg.Writeback = Writeback; - Op->StartLoc = S; Op->EndLoc = E; + return Op; } - static void CreateImm(OwningPtr<ARMOperand> &Op, const MCExpr *Val, - SMLoc S, SMLoc E) { - Op.reset(new ARMOperand); - Op->Kind = Immediate; + static ARMOperand * + CreateRegList(const SmallVectorImpl<std::pair<unsigned, SMLoc> > &Regs, + SMLoc StartLoc, SMLoc EndLoc) { + KindTy Kind = RegisterList; + + if (ARM::DPRRegClass.contains(Regs.front().first)) + Kind = DPRRegisterList; + else if (ARM::SPRRegClass.contains(Regs.front().first)) + Kind = SPRRegisterList; + + ARMOperand *Op = new ARMOperand(Kind); + for (SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator + I = Regs.begin(), E = Regs.end(); I != E; ++I) + Op->Registers.push_back(I->first); + array_pod_sort(Op->Registers.begin(), Op->Registers.end()); + Op->StartLoc = StartLoc; + Op->EndLoc = EndLoc; + return Op; + } + + static ARMOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(Immediate); Op->Imm.Val = Val; - Op->StartLoc = S; Op->EndLoc = E; + return Op; } - static void CreateMem(OwningPtr<ARMOperand> &Op, - unsigned BaseRegNum, bool OffsetIsReg, - const MCExpr *Offset, unsigned OffsetRegNum, - bool OffsetRegShifted, enum ShiftType ShiftType, - const MCExpr *ShiftAmount, bool Preindexed, - bool Postindexed, bool Negative, bool Writeback, - SMLoc S, SMLoc E) { - Op.reset(new ARMOperand); - Op->Kind = Memory; + static ARMOperand *CreateMem(unsigned BaseRegNum, bool OffsetIsReg, + const MCExpr *Offset, int OffsetRegNum, + bool OffsetRegShifted, enum ShiftType ShiftType, + const MCExpr *ShiftAmount, bool Preindexed, + bool Postindexed, bool Negative, bool Writeback, + SMLoc S, SMLoc E) { + assert((OffsetRegNum == -1 || OffsetIsReg) && + "OffsetRegNum must imply OffsetIsReg!"); + assert((!OffsetRegShifted || OffsetIsReg) && + "OffsetRegShifted must imply OffsetIsReg!"); + assert((Offset || OffsetIsReg) && + "Offset must exists unless register offset is used!"); + assert((!ShiftAmount || (OffsetIsReg && OffsetRegShifted)) && + "Cannot have shift amount without shifted register offset!"); + assert((!Offset || !OffsetIsReg) && + "Cannot have expression offset and register offset!"); + + ARMOperand *Op = new ARMOperand(Memory); Op->Mem.BaseRegNum = BaseRegNum; Op->Mem.OffsetIsReg = OffsetIsReg; - Op->Mem.Offset = Offset; - Op->Mem.OffsetRegNum = OffsetRegNum; + if (OffsetIsReg) + Op->Mem.Offset.RegNum = OffsetRegNum; + else + Op->Mem.Offset.Value = Offset; Op->Mem.OffsetRegShifted = OffsetRegShifted; Op->Mem.ShiftType = ShiftType; Op->Mem.ShiftAmount = ShiftAmount; @@ -313,9 +584,34 @@ public: Op->Mem.Postindexed = Postindexed; Op->Mem.Negative = Negative; Op->Mem.Writeback = Writeback; - + Op->StartLoc = S; Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) { + ARMOperand *Op = new ARMOperand(MemBarrierOpt); + Op->MBOpt.Val = Opt; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) { + ARMOperand *Op = new ARMOperand(ProcIFlags); + Op->IFlags.Val = IFlags; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateMSRMask(unsigned MMask, SMLoc S) { + ARMOperand *Op = new ARMOperand(MSRMask); + Op->MMask.Val = MMask; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; } }; @@ -324,17 +620,77 @@ public: void ARMOperand::dump(raw_ostream &OS) const { switch (Kind) { case CondCode: - OS << ARMCondCodeToString(getCondCode()); + OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">"; + break; + case CCOut: + OS << "<ccout " << getReg() << ">"; + break; + case CoprocNum: + OS << "<coprocessor number: " << getCoproc() << ">"; + break; + case CoprocReg: + OS << "<coprocessor register: " << getCoproc() << ">"; + break; + case MSRMask: + OS << "<mask: " << getMSRMask() << ">"; break; case Immediate: getImm()->print(OS); break; + case MemBarrierOpt: + OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt()) << ">"; + break; case Memory: - OS << "<memory>"; + OS << "<memory " + << "base:" << getMemBaseRegNum(); + if (getMemOffsetIsReg()) { + OS << " offset:<register " << getMemOffsetRegNum(); + if (getMemOffsetRegShifted()) { + OS << " offset-shift-type:" << getMemShiftType(); + OS << " offset-shift-amount:" << *getMemShiftAmount(); + } + } else { + OS << " offset:" << *getMemOffset(); + } + if (getMemOffsetIsReg()) + OS << " (offset-is-reg)"; + if (getMemPreindexed()) + OS << " (pre-indexed)"; + if (getMemPostindexed()) + OS << " (post-indexed)"; + if (getMemNegative()) + OS << " (negative)"; + if (getMemWriteback()) + OS << " (writeback)"; + OS << ">"; + break; + case ProcIFlags: { + OS << "<ARM_PROC::"; + unsigned IFlags = getProcIFlags(); + for (int i=2; i >= 0; --i) + if (IFlags & (1 << i)) + OS << ARM_PROC::IFlagsToString(1 << i); + OS << ">"; break; + } case Register: OS << "<register " << getReg() << ">"; break; + case RegisterList: + case DPRRegisterList: + case SPRRegisterList: { + OS << "<register_list "; + + const SmallVectorImpl<unsigned> &RegList = getRegList(); + for (SmallVectorImpl<unsigned>::const_iterator + I = RegList.begin(), E = RegList.end(); I != E; ) { + OS << *I; + if (++I < E) OS << ", "; + } + + OS << ">"; + break; + } case Token: OS << "'" << getToken() << "'"; break; @@ -348,184 +704,456 @@ static unsigned MatchRegisterName(StringRef Name); /// } +bool ARMAsmParser::ParseRegister(unsigned &RegNo, + SMLoc &StartLoc, SMLoc &EndLoc) { + RegNo = TryParseRegister(); + + return (RegNo == (unsigned)-1); +} + /// Try to parse a register name. The token must be an Identifier when called, -/// and if it is a register name a Reg operand is created, the token is eaten -/// and false is returned. Else true is returned and no token is eaten. -/// TODO this is likely to change to allow different register types and or to -/// parse for a specific register type. -bool ARMAsmParser::MaybeParseRegister - (OwningPtr<ARMOperand> &Op, bool ParseWriteBack) { - SMLoc S, E; +/// and if it is a register name the token is eaten and the register number is +/// returned. Otherwise return -1. +/// +int ARMAsmParser::TryParseRegister() { const AsmToken &Tok = Parser.getTok(); assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); // FIXME: Validate register for the current architecture; we have to do // validation later, so maybe there is no need for this here. - int RegNum; + std::string upperCase = Tok.getString().str(); + std::string lowerCase = LowercaseString(upperCase); + unsigned RegNum = MatchRegisterName(lowerCase); + if (!RegNum) { + RegNum = StringSwitch<unsigned>(lowerCase) + .Case("r13", ARM::SP) + .Case("r14", ARM::LR) + .Case("r15", ARM::PC) + .Case("ip", ARM::R12) + .Default(0); + } + if (!RegNum) return -1; - RegNum = MatchRegisterName(Tok.getString()); - if (RegNum == -1) - return true; - - S = Tok.getLoc(); - Parser.Lex(); // Eat identifier token. - - E = Parser.getTok().getLoc(); + return RegNum; +} - bool Writeback = false; - if (ParseWriteBack) { - const AsmToken &ExclaimTok = Parser.getTok(); - if (ExclaimTok.is(AsmToken::Exclaim)) { - E = ExclaimTok.getLoc(); - Writeback = true; - Parser.Lex(); // Eat exclaim token +/// Try to parse a register name. The token must be an Identifier when called. +/// If it's a register, an AsmOperand is created. Another AsmOperand is created +/// if there is a "writeback". 'true' if it's not a register. +/// +/// TODO this is likely to change to allow different register types and or to +/// parse for a specific register type. +bool ARMAsmParser:: +TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + int RegNo = TryParseRegister(); + if (RegNo == -1) + return true; + + Operands.push_back(ARMOperand::CreateReg(RegNo, S, Parser.getTok().getLoc())); + + const AsmToken &ExclaimTok = Parser.getTok(); + if (ExclaimTok.is(AsmToken::Exclaim)) { + Operands.push_back(ARMOperand::CreateToken(ExclaimTok.getString(), + ExclaimTok.getLoc())); + Parser.Lex(); // Eat exclaim token + } + + return false; +} + +/// MatchCoprocessorOperandName - Try to parse an coprocessor related +/// instruction with a symbolic operand name. Example: "p1", "p7", "c3", +/// "c5", ... +static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) { + // Use the same layout as the tablegen'erated register name matcher. Ugly, + // but efficient. + switch (Name.size()) { + default: break; + case 2: + if (Name[0] != CoprocOp) + return -1; + switch (Name[1]) { + default: return -1; + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + } + break; + case 3: + if (Name[0] != CoprocOp || Name[1] != '1') + return -1; + switch (Name[2]) { + default: return -1; + case '0': return 10; + case '1': return 11; + case '2': return 12; + case '3': return 13; + case '4': return 14; + case '5': return 15; } + break; } - ARMOperand::CreateReg(Op, RegNum, Writeback, S, E); + return -1; +} - return false; +/// tryParseCoprocNumOperand - Try to parse an coprocessor number operand. The +/// token must be an Identifier when called, and if it is a coprocessor +/// number, the token is eaten and the operand is added to the operand list. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +tryParseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + int Num = MatchCoprocessorOperandName(Tok.getString(), 'p'); + if (Num == -1) + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateCoprocNum(Num, S)); + return MatchOperand_Success; } -/// Parse a register list, return false if successful else return true or an -/// error. The first token must be a '{' when called. -bool ARMAsmParser::ParseRegisterList(OwningPtr<ARMOperand> &Op) { - SMLoc S, E; - assert(Parser.getTok().is(AsmToken::LCurly) && - "Token is not an Left Curly Brace"); - S = Parser.getTok().getLoc(); - Parser.Lex(); // Eat left curly brace token. - - const AsmToken &RegTok = Parser.getTok(); - SMLoc RegLoc = RegTok.getLoc(); - if (RegTok.isNot(AsmToken::Identifier)) - return Error(RegLoc, "register expected"); - int RegNum = MatchRegisterName(RegTok.getString()); - if (RegNum == -1) - return Error(RegLoc, "register expected"); +/// tryParseCoprocRegOperand - Try to parse an coprocessor register operand. The +/// token must be an Identifier when called, and if it is a coprocessor +/// number, the token is eaten and the operand is added to the operand list. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +tryParseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c'); + if (Reg == -1) + return MatchOperand_NoMatch; + Parser.Lex(); // Eat identifier token. - unsigned RegList = 1 << RegNum; + Operands.push_back(ARMOperand::CreateCoprocReg(Reg, S)); + return MatchOperand_Success; +} - int HighRegNum = RegNum; - // TODO ranges like "{Rn-Rm}" - while (Parser.getTok().is(AsmToken::Comma)) { - Parser.Lex(); // Eat comma token. +/// Parse a register list, return it if successful else return null. The first +/// token must be a '{' when called. +bool ARMAsmParser:: +ParseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + assert(Parser.getTok().is(AsmToken::LCurly) && + "Token is not a Left Curly Brace"); + SMLoc S = Parser.getTok().getLoc(); + + // Read the rest of the registers in the list. + unsigned PrevRegNum = 0; + SmallVector<std::pair<unsigned, SMLoc>, 32> Registers; + + do { + bool IsRange = Parser.getTok().is(AsmToken::Minus); + Parser.Lex(); // Eat non-identifier token. const AsmToken &RegTok = Parser.getTok(); SMLoc RegLoc = RegTok.getLoc(); - if (RegTok.isNot(AsmToken::Identifier)) - return Error(RegLoc, "register expected"); - int RegNum = MatchRegisterName(RegTok.getString()); - if (RegNum == -1) - return Error(RegLoc, "register expected"); + if (RegTok.isNot(AsmToken::Identifier)) { + Error(RegLoc, "register expected"); + return true; + } - if (RegList & (1 << RegNum)) - Warning(RegLoc, "register duplicated in register list"); - else if (RegNum <= HighRegNum) - Warning(RegLoc, "register not in ascending order in register list"); - RegList |= 1 << RegNum; - HighRegNum = RegNum; + int RegNum = TryParseRegister(); + if (RegNum == -1) { + Error(RegLoc, "register expected"); + return true; + } - Parser.Lex(); // Eat identifier token. - } + if (IsRange) { + int Reg = PrevRegNum; + do { + ++Reg; + Registers.push_back(std::make_pair(Reg, RegLoc)); + } while (Reg != RegNum); + } else { + Registers.push_back(std::make_pair(RegNum, RegLoc)); + } + + PrevRegNum = RegNum; + } while (Parser.getTok().is(AsmToken::Comma) || + Parser.getTok().is(AsmToken::Minus)); + + // Process the right curly brace of the list. const AsmToken &RCurlyTok = Parser.getTok(); - if (RCurlyTok.isNot(AsmToken::RCurly)) - return Error(RCurlyTok.getLoc(), "'}' expected"); - E = RCurlyTok.getLoc(); - Parser.Lex(); // Eat left curly brace token. + if (RCurlyTok.isNot(AsmToken::RCurly)) { + Error(RCurlyTok.getLoc(), "'}' expected"); + return true; + } + + SMLoc E = RCurlyTok.getLoc(); + Parser.Lex(); // Eat right curly brace token. + + // Verify the register list. + SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator + RI = Registers.begin(), RE = Registers.end(); + + unsigned HighRegNum = getARMRegisterNumbering(RI->first); + bool EmittedWarning = false; + + DenseMap<unsigned, bool> RegMap; + RegMap[HighRegNum] = true; + for (++RI; RI != RE; ++RI) { + const std::pair<unsigned, SMLoc> &RegInfo = *RI; + unsigned Reg = getARMRegisterNumbering(RegInfo.first); + + if (RegMap[Reg]) { + Error(RegInfo.second, "register duplicated in register list"); + return true; + } + + if (!EmittedWarning && Reg < HighRegNum) + Warning(RegInfo.second, + "register not in ascending order in register list"); + + RegMap[Reg] = true; + HighRegNum = std::max(Reg, HighRegNum); + } + + Operands.push_back(ARMOperand::CreateRegList(Registers, S, E)); return false; } -/// Parse an arm memory expression, return false if successful else return true +/// tryParseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +tryParseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + StringRef OptStr = Tok.getString(); + + unsigned Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size())) + .Case("sy", ARM_MB::SY) + .Case("st", ARM_MB::ST) + .Case("ish", ARM_MB::ISH) + .Case("ishst", ARM_MB::ISHST) + .Case("nsh", ARM_MB::NSH) + .Case("nshst", ARM_MB::NSHST) + .Case("osh", ARM_MB::OSH) + .Case("oshst", ARM_MB::OSHST) + .Default(~0U); + + if (Opt == ~0U) + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateMemBarrierOpt((ARM_MB::MemBOpt)Opt, S)); + return MatchOperand_Success; +} + +/// tryParseProcIFlagsOperand - Try to parse iflags from CPS instruction. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +tryParseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + StringRef IFlagsStr = Tok.getString(); + + unsigned IFlags = 0; + for (int i = 0, e = IFlagsStr.size(); i != e; ++i) { + unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1)) + .Case("a", ARM_PROC::A) + .Case("i", ARM_PROC::I) + .Case("f", ARM_PROC::F) + .Default(~0U); + + // If some specific iflag is already set, it means that some letter is + // present more than once, this is not acceptable. + if (Flag == ~0U || (IFlags & Flag)) + return MatchOperand_NoMatch; + + IFlags |= Flag; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateProcIFlags((ARM_PROC::IFlags)IFlags, S)); + return MatchOperand_Success; +} + +/// tryParseMSRMaskOperand - Try to parse mask flags from MSR instruction. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + StringRef Mask = Tok.getString(); + + // Split spec_reg from flag, example: CPSR_sxf => "CPSR" and "sxf" + size_t Start = 0, Next = Mask.find('_'); + StringRef Flags = ""; + StringRef SpecReg = Mask.slice(Start, Next); + if (Next != StringRef::npos) + Flags = Mask.slice(Next+1, Mask.size()); + + // FlagsVal contains the complete mask: + // 3-0: Mask + // 4: Special Reg (cpsr, apsr => 0; spsr => 1) + unsigned FlagsVal = 0; + + if (SpecReg == "apsr") { + FlagsVal = StringSwitch<unsigned>(Flags) + .Case("nzcvq", 0x8) // same as CPSR_c + .Case("g", 0x4) // same as CPSR_s + .Case("nzcvqg", 0xc) // same as CPSR_fs + .Default(~0U); + + if (FlagsVal == ~0U) { + if (!Flags.empty()) + return MatchOperand_NoMatch; + else + FlagsVal = 0; // No flag + } + } else if (SpecReg == "cpsr" || SpecReg == "spsr") { + for (int i = 0, e = Flags.size(); i != e; ++i) { + unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1)) + .Case("c", 1) + .Case("x", 2) + .Case("s", 4) + .Case("f", 8) + .Default(~0U); + + // If some specific flag is already set, it means that some letter is + // present more than once, this is not acceptable. + if (FlagsVal == ~0U || (FlagsVal & Flag)) + return MatchOperand_NoMatch; + FlagsVal |= Flag; + } + } else // No match for special register. + return MatchOperand_NoMatch; + + // Special register without flags are equivalent to "fc" flags. + if (!FlagsVal) + FlagsVal = 0x9; + + // Bit 4: Special Reg (cpsr, apsr => 0; spsr => 1) + if (SpecReg == "spsr") + FlagsVal |= 16; + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S)); + return MatchOperand_Success; +} + +/// Parse an ARM memory expression, return false if successful else return true /// or an error. The first token must be a '[' when called. +/// /// TODO Only preindexing and postindexing addressing are started, unindexed /// with option, etc are still to do. -bool ARMAsmParser::ParseMemory(OwningPtr<ARMOperand> &Op) { +bool ARMAsmParser:: +ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { SMLoc S, E; assert(Parser.getTok().is(AsmToken::LBrac) && - "Token is not an Left Bracket"); + "Token is not a Left Bracket"); S = Parser.getTok().getLoc(); Parser.Lex(); // Eat left bracket token. const AsmToken &BaseRegTok = Parser.getTok(); - if (BaseRegTok.isNot(AsmToken::Identifier)) - return Error(BaseRegTok.getLoc(), "register expected"); - if (MaybeParseRegister(Op, false)) - return Error(BaseRegTok.getLoc(), "register expected"); - int BaseRegNum = Op->getReg(); + if (BaseRegTok.isNot(AsmToken::Identifier)) { + Error(BaseRegTok.getLoc(), "register expected"); + return true; + } + int BaseRegNum = TryParseRegister(); + if (BaseRegNum == -1) { + Error(BaseRegTok.getLoc(), "register expected"); + return true; + } + + // The next token must either be a comma or a closing bracket. + const AsmToken &Tok = Parser.getTok(); + if (!Tok.is(AsmToken::Comma) && !Tok.is(AsmToken::RBrac)) + return true; bool Preindexed = false; bool Postindexed = false; bool OffsetIsReg = false; bool Negative = false; bool Writeback = false; + ARMOperand *WBOp = 0; + int OffsetRegNum = -1; + bool OffsetRegShifted = false; + enum ShiftType ShiftType = Lsl; + const MCExpr *ShiftAmount = 0; + const MCExpr *Offset = 0; // First look for preindexed address forms, that is after the "[Rn" we now // have to see if the next token is a comma. - const AsmToken &Tok = Parser.getTok(); if (Tok.is(AsmToken::Comma)) { Preindexed = true; Parser.Lex(); // Eat comma token. - int OffsetRegNum; - bool OffsetRegShifted; - enum ShiftType ShiftType; - const MCExpr *ShiftAmount; - const MCExpr *Offset; - if(ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, ShiftAmount, - Offset, OffsetIsReg, OffsetRegNum, E)) + + if (ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, ShiftAmount, + Offset, OffsetIsReg, OffsetRegNum, E)) return true; const AsmToken &RBracTok = Parser.getTok(); - if (RBracTok.isNot(AsmToken::RBrac)) - return Error(RBracTok.getLoc(), "']' expected"); + if (RBracTok.isNot(AsmToken::RBrac)) { + Error(RBracTok.getLoc(), "']' expected"); + return true; + } E = RBracTok.getLoc(); Parser.Lex(); // Eat right bracket token. const AsmToken &ExclaimTok = Parser.getTok(); if (ExclaimTok.is(AsmToken::Exclaim)) { - E = ExclaimTok.getLoc(); + WBOp = ARMOperand::CreateToken(ExclaimTok.getString(), + ExclaimTok.getLoc()); Writeback = true; Parser.Lex(); // Eat exclaim token } - ARMOperand::CreateMem(Op, BaseRegNum, OffsetIsReg, Offset, OffsetRegNum, - OffsetRegShifted, ShiftType, ShiftAmount, - Preindexed, Postindexed, Negative, Writeback, S, E); - return false; - } - // The "[Rn" we have so far was not followed by a comma. - else if (Tok.is(AsmToken::RBrac)) { - // This is a post indexing addressing forms, that is a ']' follows after - // the "[Rn". - Postindexed = true; - Writeback = true; + } else { + // The "[Rn" we have so far was not followed by a comma. + + // If there's anything other than the right brace, this is a post indexing + // addressing form. E = Tok.getLoc(); Parser.Lex(); // Eat right bracket token. - int OffsetRegNum = 0; - bool OffsetRegShifted = false; - enum ShiftType ShiftType; - const MCExpr *ShiftAmount; - const MCExpr *Offset; - const AsmToken &NextTok = Parser.getTok(); + if (NextTok.isNot(AsmToken::EndOfStatement)) { - if (NextTok.isNot(AsmToken::Comma)) - return Error(NextTok.getLoc(), "',' expected"); + Postindexed = true; + Writeback = true; + + if (NextTok.isNot(AsmToken::Comma)) { + Error(NextTok.getLoc(), "',' expected"); + return true; + } + Parser.Lex(); // Eat comma token. - if(ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, - ShiftAmount, Offset, OffsetIsReg, OffsetRegNum, - E)) + + if (ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, + ShiftAmount, Offset, OffsetIsReg, OffsetRegNum, + E)) return true; } + } - ARMOperand::CreateMem(Op, BaseRegNum, OffsetIsReg, Offset, OffsetRegNum, - OffsetRegShifted, ShiftType, ShiftAmount, - Preindexed, Postindexed, Negative, Writeback, S, E); - return false; + // Force Offset to exist if used. + if (!OffsetIsReg) { + if (!Offset) + Offset = MCConstantExpr::Create(0, getContext()); } - return true; + Operands.push_back(ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, + OffsetRegNum, OffsetRegShifted, + ShiftType, ShiftAmount, Preindexed, + Postindexed, Negative, Writeback, + S, E)); + if (WBOp) + Operands.push_back(WBOp); + + return false; } /// Parse the offset of a memory operand after we have seen "[Rn," or "[Rn]," @@ -543,7 +1171,6 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, bool &OffsetIsReg, int &OffsetRegNum, SMLoc &E) { - OwningPtr<ARMOperand> Op; Negative = false; OffsetRegShifted = false; OffsetIsReg = false; @@ -559,13 +1186,15 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, // See if there is a register following the "[Rn," or "[Rn]," we have so far. const AsmToken &OffsetRegTok = Parser.getTok(); if (OffsetRegTok.is(AsmToken::Identifier)) { - OffsetIsReg = !MaybeParseRegister(Op, false); - if (OffsetIsReg) { - E = Op->getEndLoc(); - OffsetRegNum = Op->getReg(); + SMLoc CurLoc = OffsetRegTok.getLoc(); + OffsetRegNum = TryParseRegister(); + if (OffsetRegNum != -1) { + OffsetIsReg = true; + E = CurLoc; } } - // If we parsed a register as the offset then their can be a shift after that + + // If we parsed a register as the offset then there can be a shift after that. if (OffsetRegNum != -1) { // Look for a comma then a shift const AsmToken &Tok = Parser.getTok(); @@ -583,7 +1212,7 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, const AsmToken &HashTok = Parser.getTok(); if (HashTok.isNot(AsmToken::Hash)) return Error(HashTok.getLoc(), "'#' expected"); - + Parser.Lex(); // Eat hash token. if (getParser().ParseExpression(Offset)) @@ -597,8 +1226,7 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, /// ( lsl | lsr | asr | ror ) , # shift_amount /// rrx /// and returns true if it parses a shift otherwise it returns false. -bool ARMAsmParser::ParseShift(ShiftType &St, - const MCExpr *&ShiftAmount, +bool ARMAsmParser::ParseShift(ShiftType &St, const MCExpr *&ShiftAmount, SMLoc &E) { const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) @@ -636,13 +1264,33 @@ bool ARMAsmParser::ParseShift(ShiftType &St, /// Parse a arm instruction operand. For now this parses the operand regardless /// of the mnemonic. -bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) { +bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, + StringRef Mnemonic) { SMLoc S, E; - + + // Check if the current operand has a custom associated parser, if so, try to + // custom parse the operand, or fallback to the general approach. + OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + if (ResTy == MatchOperand_Success) + return false; + // If there wasn't a custom match, try the generic matcher below. Otherwise, + // there was a match, but an error occurred, in which case, just return that + // the operand parsing failed. + if (ResTy == MatchOperand_ParseFail) + return true; + switch (getLexer().getKind()) { + default: + Error(Parser.getTok().getLoc(), "unexpected token in operand"); + return true; case AsmToken::Identifier: - if (!MaybeParseRegister(Op, true)) + if (!TryParseRegisterWithWriteBack(Operands)) return false; + + // Fall though for the Identifier case that is not a register or a + // special name. + case AsmToken::Integer: // things like 1f and 2b as a branch targets + case AsmToken::Dot: { // . as a branch target // This was not a register so parse other operands that start with an // identifier (like labels) as expressions and create them as immediates. const MCExpr *IdVal; @@ -650,12 +1298,13 @@ bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) { if (getParser().ParseExpression(IdVal)) return true; E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - ARMOperand::CreateImm(Op, IdVal, S, E); + Operands.push_back(ARMOperand::CreateImm(IdVal, S, E)); return false; + } case AsmToken::LBrac: - return ParseMemory(Op); + return ParseMemory(Operands); case AsmToken::LCurly: - return ParseRegisterList(Op); + return ParseRegisterList(Operands); case AsmToken::Hash: // #42 -> immediate. // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate @@ -665,28 +1314,134 @@ bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) { if (getParser().ParseExpression(ImmVal)) return true; E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - ARMOperand::CreateImm(Op, ImmVal, S, E); + Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E)); return false; - default: - return Error(Parser.getTok().getLoc(), "unexpected token in operand"); + case AsmToken::Colon: { + // ":lower16:" and ":upper16:" expression prefixes + // FIXME: Check it's an expression prefix, + // e.g. (FOO - :lower16:BAR) isn't legal. + ARMMCExpr::VariantKind RefKind; + if (ParsePrefix(RefKind)) + return true; + + const MCExpr *SubExprVal; + if (getParser().ParseExpression(SubExprVal)) + return true; + + const MCExpr *ExprVal = ARMMCExpr::Create(RefKind, SubExprVal, + getContext()); + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E)); + return false; + } } } -/// Parse an arm instruction mnemonic followed by its operands. -bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) { - OwningPtr<ARMOperand> Op; +// ParsePrefix - Parse ARM 16-bit relocations expression prefix, i.e. +// :lower16: and :upper16:. +bool ARMAsmParser::ParsePrefix(ARMMCExpr::VariantKind &RefKind) { + RefKind = ARMMCExpr::VK_ARM_None; - // Create the leading tokens for the mnemonic, split by '.' characters. - size_t Start = 0, Next = Name.find('.'); - StringRef Head = Name.slice(Start, Next); + // :lower16: and :upper16: modifiers + assert(getLexer().is(AsmToken::Colon) && "expected a :"); + Parser.Lex(); // Eat ':' + + if (getLexer().isNot(AsmToken::Identifier)) { + Error(Parser.getTok().getLoc(), "expected prefix identifier in operand"); + return true; + } + + StringRef IDVal = Parser.getTok().getIdentifier(); + if (IDVal == "lower16") { + RefKind = ARMMCExpr::VK_ARM_LO16; + } else if (IDVal == "upper16") { + RefKind = ARMMCExpr::VK_ARM_HI16; + } else { + Error(Parser.getTok().getLoc(), "unexpected prefix in operand"); + return true; + } + Parser.Lex(); + + if (getLexer().isNot(AsmToken::Colon)) { + Error(Parser.getTok().getLoc(), "unexpected token after prefix"); + return true; + } + Parser.Lex(); // Eat the last ':' + return false; +} + +const MCExpr * +ARMAsmParser::ApplyPrefixToExpr(const MCExpr *E, + MCSymbolRefExpr::VariantKind Variant) { + // Recurse over the given expression, rebuilding it to apply the given variant + // to the leftmost symbol. + if (Variant == MCSymbolRefExpr::VK_None) + return E; + + switch (E->getKind()) { + case MCExpr::Target: + llvm_unreachable("Can't handle target expr yet"); + case MCExpr::Constant: + llvm_unreachable("Can't handle lower16/upper16 of constant yet"); + + case MCExpr::SymbolRef: { + const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E); - // Determine the predicate, if any. + if (SRE->getKind() != MCSymbolRefExpr::VK_None) + return 0; + + return MCSymbolRefExpr::Create(&SRE->getSymbol(), Variant, getContext()); + } + + case MCExpr::Unary: + llvm_unreachable("Can't handle unary expressions yet"); + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(E); + const MCExpr *LHS = ApplyPrefixToExpr(BE->getLHS(), Variant); + const MCExpr *RHS = BE->getRHS(); + if (!LHS) + return 0; + + return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, getContext()); + } + } + + assert(0 && "Invalid expression kind!"); + return 0; +} + +/// \brief Given a mnemonic, split out possible predication code and carry +/// setting letters to form a canonical mnemonic and flags. +// +// FIXME: Would be nice to autogen this. +static StringRef SplitMnemonic(StringRef Mnemonic, + unsigned &PredicationCode, + bool &CarrySetting, + unsigned &ProcessorIMod) { + PredicationCode = ARMCC::AL; + CarrySetting = false; + ProcessorIMod = 0; + + // Ignore some mnemonics we know aren't predicated forms. // - // FIXME: We need a way to check whether a prefix supports predication, - // otherwise we will end up with an ambiguity for instructions that happen to - // end with a predicate name. - unsigned CC = StringSwitch<unsigned>(Head.substr(Head.size()-2)) + // FIXME: Would be nice to autogen this. + if (Mnemonic == "teq" || Mnemonic == "vceq" || + Mnemonic == "movs" || + Mnemonic == "svc" || + (Mnemonic == "mls" || Mnemonic == "smmls" || Mnemonic == "vcls" || + Mnemonic == "vmls" || Mnemonic == "vnmls") || + Mnemonic == "vacge" || Mnemonic == "vcge" || + Mnemonic == "vclt" || + Mnemonic == "vacgt" || Mnemonic == "vcgt" || + Mnemonic == "vcle" || + (Mnemonic == "smlal" || Mnemonic == "umaal" || Mnemonic == "umlal" || + Mnemonic == "vabal" || Mnemonic == "vmlal" || Mnemonic == "vpadal" || + Mnemonic == "vqdmlal")) + return Mnemonic; + + // First, split out any predication code. + unsigned CC = StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2)) .Case("eq", ARMCC::EQ) .Case("ne", ARMCC::NE) .Case("hs", ARMCC::HS) @@ -704,44 +1459,268 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc, .Case("al", ARMCC::AL) .Default(~0U); if (CC != ~0U) { - Head = Head.slice(0, Head.size() - 2); - } else - CC = ARMCC::AL; + Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2); + PredicationCode = CC; + } + + // Next, determine if we have a carry setting bit. We explicitly ignore all + // the instructions we know end in 's'. + if (Mnemonic.endswith("s") && + !(Mnemonic == "asrs" || Mnemonic == "cps" || Mnemonic == "mls" || + Mnemonic == "movs" || Mnemonic == "mrs" || Mnemonic == "smmls" || + Mnemonic == "vabs" || Mnemonic == "vcls" || Mnemonic == "vmls" || + Mnemonic == "vmrs" || Mnemonic == "vnmls" || Mnemonic == "vqabs" || + Mnemonic == "vrecps" || Mnemonic == "vrsqrts")) { + Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1); + CarrySetting = true; + } + + // The "cps" instruction can have a interrupt mode operand which is glued into + // the mnemonic. Check if this is the case, split it and parse the imod op + if (Mnemonic.startswith("cps")) { + // Split out any imod code. + unsigned IMod = + StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2, 2)) + .Case("ie", ARM_PROC::IE) + .Case("id", ARM_PROC::ID) + .Default(~0U); + if (IMod != ~0U) { + Mnemonic = Mnemonic.slice(0, Mnemonic.size()-2); + ProcessorIMod = IMod; + } + } + + return Mnemonic; +} + +/// \brief Given a canonical mnemonic, determine if the instruction ever allows +/// inclusion of carry set or predication code operands. +// +// FIXME: It would be nice to autogen this. +void ARMAsmParser:: +GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet, + bool &CanAcceptPredicationCode) { + bool isThumb = TM.getSubtarget<ARMSubtarget>().isThumb(); + + if (Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" || + Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" || + Mnemonic == "smull" || Mnemonic == "add" || Mnemonic == "adc" || + Mnemonic == "mul" || Mnemonic == "bic" || Mnemonic == "asr" || + Mnemonic == "umlal" || Mnemonic == "orr" || Mnemonic == "mov" || + Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" || + Mnemonic == "sbc" || Mnemonic == "mla" || Mnemonic == "umull" || + Mnemonic == "eor" || Mnemonic == "smlal" || Mnemonic == "mvn") { + CanAcceptCarrySet = true; + } else { + CanAcceptCarrySet = false; + } + + if (Mnemonic == "cbnz" || Mnemonic == "setend" || Mnemonic == "dmb" || + Mnemonic == "cps" || Mnemonic == "mcr2" || Mnemonic == "it" || + Mnemonic == "mcrr2" || Mnemonic == "cbz" || Mnemonic == "cdp2" || + Mnemonic == "trap" || Mnemonic == "mrc2" || Mnemonic == "mrrc2" || + Mnemonic == "dsb" || Mnemonic == "movs" || Mnemonic == "isb" || + Mnemonic == "clrex" || Mnemonic.startswith("cps")) { + CanAcceptPredicationCode = false; + } else { + CanAcceptPredicationCode = true; + } - ARMOperand::CreateToken(Op, Head, NameLoc); - Operands.push_back(Op.take()); + if (isThumb) + if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" || + Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp") + CanAcceptPredicationCode = false; +} + +/// Parse an arm instruction mnemonic followed by its operands. +bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Create the leading tokens for the mnemonic, split by '.' characters. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + + // Split out the predication code and carry setting flag from the mnemonic. + unsigned PredicationCode; + unsigned ProcessorIMod; + bool CarrySetting; + Head = SplitMnemonic(Head, PredicationCode, CarrySetting, + ProcessorIMod); + + Operands.push_back(ARMOperand::CreateToken(Head, NameLoc)); + + // Next, add the CCOut and ConditionCode operands, if needed. + // + // For mnemonics which can ever incorporate a carry setting bit or predication + // code, our matching model involves us always generating CCOut and + // ConditionCode operands to match the mnemonic "as written" and then we let + // the matcher deal with finding the right instruction or generating an + // appropriate error. + bool CanAcceptCarrySet, CanAcceptPredicationCode; + GetMnemonicAcceptInfo(Head, CanAcceptCarrySet, CanAcceptPredicationCode); + + // Add the carry setting operand, if necessary. + // + // FIXME: It would be awesome if we could somehow invent a location such that + // match errors on this operand would print a nice diagnostic about how the + // 's' character in the mnemonic resulted in a CCOut operand. + if (CanAcceptCarrySet) { + Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0, + NameLoc)); + } else { + // This mnemonic can't ever accept a carry set, but the user wrote one (or + // misspelled another mnemonic). + + // FIXME: Issue a nice error. + } + + // Add the predication code operand, if necessary. + if (CanAcceptPredicationCode) { + Operands.push_back(ARMOperand::CreateCondCode( + ARMCC::CondCodes(PredicationCode), NameLoc)); + } else { + // This mnemonic can't ever accept a predication code, but the user wrote + // one (or misspelled another mnemonic). + + // FIXME: Issue a nice error. + } + + // Add the processor imod operand, if necessary. + if (ProcessorIMod) { + Operands.push_back(ARMOperand::CreateImm( + MCConstantExpr::Create(ProcessorIMod, getContext()), + NameLoc, NameLoc)); + } else { + // This mnemonic can't ever accept a imod, but the user wrote + // one (or misspelled another mnemonic). - ARMOperand::CreateCondCode(Op, ARMCC::CondCodes(CC), NameLoc); - Operands.push_back(Op.take()); + // FIXME: Issue a nice error. + } // Add the remaining tokens in the mnemonic. while (Next != StringRef::npos) { Start = Next; Next = Name.find('.', Start + 1); - Head = Name.slice(Start, Next); + StringRef ExtraToken = Name.slice(Start, Next); - ARMOperand::CreateToken(Op, Head, NameLoc); - Operands.push_back(Op.take()); + Operands.push_back(ARMOperand::CreateToken(ExtraToken, NameLoc)); } // Read the remaining operands. if (getLexer().isNot(AsmToken::EndOfStatement)) { // Read the first operand. - OwningPtr<ARMOperand> Op; - if (ParseOperand(Op)) return true; - Operands.push_back(Op.take()); + if (ParseOperand(Operands, Head)) { + Parser.EatToEndOfStatement(); + return true; + } while (getLexer().is(AsmToken::Comma)) { Parser.Lex(); // Eat the comma. // Parse and remember the operand. - if (ParseOperand(Op)) return true; - Operands.push_back(Op.take()); + if (ParseOperand(Operands, Head)) { + Parser.EatToEndOfStatement(); + return true; + } } } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Parser.EatToEndOfStatement(); + return TokError("unexpected token in argument list"); + } + + Parser.Lex(); // Consume the EndOfStatement return false; } +bool ARMAsmParser:: +MatchAndEmitInstruction(SMLoc IDLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCStreamer &Out) { + MCInst Inst; + unsigned ErrorInfo; + MatchResultTy MatchResult, MatchResult2; + MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo); + if (MatchResult != Match_Success) { + // If we get a Match_InvalidOperand it might be some arithmetic instruction + // that does not update the condition codes. So try adding a CCOut operand + // with a value of reg0. + if (MatchResult == Match_InvalidOperand) { + Operands.insert(Operands.begin() + 1, + ARMOperand::CreateCCOut(0, + ((ARMOperand*)Operands[0])->getStartLoc())); + MatchResult2 = MatchInstructionImpl(Operands, Inst, ErrorInfo); + if (MatchResult2 == Match_Success) + MatchResult = Match_Success; + else { + ARMOperand *CCOut = ((ARMOperand*)Operands[1]); + Operands.erase(Operands.begin() + 1); + delete CCOut; + } + } + // If we get a Match_MnemonicFail it might be some arithmetic instruction + // that updates the condition codes if it ends in 's'. So see if the + // mnemonic ends in 's' and if so try removing the 's' and adding a CCOut + // operand with a value of CPSR. + else if(MatchResult == Match_MnemonicFail) { + // Get the instruction mnemonic, which is the first token. + StringRef Mnemonic = ((ARMOperand*)Operands[0])->getToken(); + if (Mnemonic.substr(Mnemonic.size()-1) == "s") { + // removed the 's' from the mnemonic for matching. + StringRef MnemonicNoS = Mnemonic.slice(0, Mnemonic.size() - 1); + SMLoc NameLoc = ((ARMOperand*)Operands[0])->getStartLoc(); + ARMOperand *OldMnemonic = ((ARMOperand*)Operands[0]); + Operands.erase(Operands.begin()); + delete OldMnemonic; + Operands.insert(Operands.begin(), + ARMOperand::CreateToken(MnemonicNoS, NameLoc)); + Operands.insert(Operands.begin() + 1, + ARMOperand::CreateCCOut(ARM::CPSR, NameLoc)); + MatchResult2 = MatchInstructionImpl(Operands, Inst, ErrorInfo); + if (MatchResult2 == Match_Success) + MatchResult = Match_Success; + else { + ARMOperand *OldMnemonic = ((ARMOperand*)Operands[0]); + Operands.erase(Operands.begin()); + delete OldMnemonic; + Operands.insert(Operands.begin(), + ARMOperand::CreateToken(Mnemonic, NameLoc)); + ARMOperand *CCOut = ((ARMOperand*)Operands[1]); + Operands.erase(Operands.begin() + 1); + delete CCOut; + } + } + } + } + switch (MatchResult) { + case Match_Success: + Out.EmitInstruction(Inst); + return false; + case Match_MissingFeature: + Error(IDLoc, "instruction requires a CPU feature not currently enabled"); + return true; + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + } + + return Error(ErrorLoc, "invalid operand for instruction"); + } + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction mnemonic"); + case Match_ConversionFail: + return Error(IDLoc, "unable to convert operands to instruction"); + } + + llvm_unreachable("Implement any new match types added!"); + return true; +} + /// ParseDirective parses the arm specific directives bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); @@ -771,7 +1750,7 @@ bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().is(AsmToken::EndOfStatement)) break; - + // FIXME: Improve diagnostic. if (getLexer().isNot(AsmToken::Comma)) return Error(L, "unexpected token in directive"); @@ -801,16 +1780,16 @@ bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) { bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) { const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String)) - return Error(L, "unexpected token in .syntax directive"); - StringRef ATTRIBUTE_UNUSED SymbolName = Parser.getTok().getIdentifier(); + return Error(L, "unexpected token in .thumb_func directive"); + StringRef Name = Tok.getString(); Parser.Lex(); // Consume the identifier token. - if (getLexer().isNot(AsmToken::EndOfStatement)) return Error(L, "unexpected token in directive"); Parser.Lex(); - // TODO: mark symbol as a thumb symbol - // getParser().getStreamer().Emit???(); + // Mark symbol as a thumb symbol. + MCSymbol *Func = getParser().getContext().GetOrCreateSymbol(Name); + getParser().getStreamer().EmitThumbFunc(Func); return false; } @@ -824,7 +1803,7 @@ bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) { if (Mode == "unified" || Mode == "UNIFIED") Parser.Lex(); else if (Mode == "divided" || Mode == "DIVIDED") - Parser.Lex(); + return Error(L, "'.syntax divided' arm asssembly not supported"); else return Error(L, "unrecognized syntax mode in .syntax directive"); @@ -855,8 +1834,21 @@ bool ARMAsmParser::ParseDirectiveCode(SMLoc L) { return Error(Parser.getTok().getLoc(), "unexpected token in directive"); Parser.Lex(); - // TODO tell the MC streamer the mode - // getParser().getStreamer().Emit???(); + // FIXME: We need to be able switch subtargets at this point so that + // MatchInstructionImpl() will work when it gets the AvailableFeatures which + // includes Feature_IsThumb or not to match the right instructions. This is + // blocked on the FIXME in llvm-mc.cpp when creating the TargetMachine. + if (Val == 16){ + assert(TM.getSubtarget<ARMSubtarget>().isThumb() && + "switching between arm/thumb not yet suppported via .code 16)"); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); + } + else{ + assert(!TM.getSubtarget<ARMSubtarget>().isThumb() && + "switching between thumb/arm not yet suppported via .code 32)"); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32); + } + return false; } @@ -869,4 +1861,6 @@ extern "C" void LLVMInitializeARMAsmParser() { LLVMInitializeARMAsmLexer(); } +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION #include "ARMGenAsmMatcher.inc" diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index e220289..78d73d3 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -39,9 +39,9 @@ /// o static uint16_t decodeThumbInstruction(field_t insn) - the decoding /// function for a Thumb instruction. /// -#include "../ARMGenDecoderTables.inc" +#include "ARMGenDecoderTables.inc" -#include "../ARMGenEDInfo.inc" +#include "ARMGenEDInfo.inc" using namespace llvm; @@ -89,7 +89,8 @@ static unsigned decodeARMInstruction(uint32_t &insn) { return ARM::BFI; } - // Ditto for STRBT, which is a super-instruction for A8.6.199 Encoding A1 & A2. + // Ditto for STRBT, which is a super-instruction for A8.6.199 Encodings + // A1 & A2. // As a result, the decoder fails to deocode USAT properly. if (slice(insn, 27, 21) == 0x37 && slice(insn, 5, 4) == 1) return ARM::USAT; @@ -252,9 +253,6 @@ static unsigned T2Morph2LoadLiteral(unsigned Opcode) { default: return Opcode; // Return unmorphed opcode. - case ARM::t2LDRDi8: - return ARM::t2LDRDpci; - case ARM::t2LDR_POST: case ARM::t2LDR_PRE: case ARM::t2LDRi12: case ARM::t2LDRi8: case ARM::t2LDRs: case ARM::t2LDRT: @@ -349,36 +347,6 @@ static unsigned decodeThumbSideEffect(bool IsThumb2, unsigned &insn) { return decodeThumbInstruction(insn); } -static inline bool Thumb2PreloadOpcodeNoPCI(unsigned Opcode) { - switch (Opcode) { - default: - return false; - case ARM::t2PLDi12: case ARM::t2PLDi8: - case ARM::t2PLDr: case ARM::t2PLDs: - case ARM::t2PLDWi12: case ARM::t2PLDWi8: - case ARM::t2PLDWr: case ARM::t2PLDWs: - case ARM::t2PLIi12: case ARM::t2PLIi8: - case ARM::t2PLIr: case ARM::t2PLIs: - return true; - } -} - -static inline unsigned T2Morph2Preload2PCI(unsigned Opcode) { - switch (Opcode) { - default: - return 0; - case ARM::t2PLDi12: case ARM::t2PLDi8: - case ARM::t2PLDr: case ARM::t2PLDs: - return ARM::t2PLDpci; - case ARM::t2PLDWi12: case ARM::t2PLDWi8: - case ARM::t2PLDWr: case ARM::t2PLDWs: - return ARM::t2PLDWpci; - case ARM::t2PLIi12: case ARM::t2PLIi8: - case ARM::t2PLIr: case ARM::t2PLIs: - return ARM::t2PLIpci; - } -} - // // Public interface for the disassembler // @@ -485,11 +453,6 @@ bool ThumbDisassembler::getInstruction(MCInst &MI, // instructions as well. unsigned Opcode = decodeThumbSideEffect(IsThumb2, insn); - // A8.6.117/119/120/121. - // PLD/PLDW/PLI instructions with Rn==15 is transformed to the pci variant. - if (Thumb2PreloadOpcodeNoPCI(Opcode) && slice(insn, 19, 16) == 15) - Opcode = T2Morph2Preload2PCI(Opcode); - ARMFormat Format = ARMFormats[Opcode]; Size = IsThumb2 ? 4 : 2; @@ -568,9 +531,9 @@ static MCDisassembler *createThumbDisassembler(const Target &T) { return new ThumbDisassembler; } -extern "C" void LLVMInitializeARMDisassembler() { +extern "C" void LLVMInitializeARMDisassembler() { // Register the disassembler. - TargetRegistry::RegisterMCDisassembler(TheARMTarget, + TargetRegistry::RegisterMCDisassembler(TheARMTarget, createARMDisassembler); TargetRegistry::RegisterMCDisassembler(TheThumbTarget, createThumbDisassembler); diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp index 9f493b9..bac68dd 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp @@ -79,22 +79,9 @@ const char *ARMUtils::OpcodeName(unsigned Opcode) { } // Return the register enum Based on RegClass and the raw register number. -// For DRegPair, see comments below. // FIXME: Auto-gened? -static unsigned getRegisterEnum(BO B, unsigned RegClassID, unsigned RawRegister, - bool DRegPair = false) { - - if (DRegPair && RegClassID == ARM::QPRRegClassID) { - // LLVM expects { Dd, Dd+1 } to form a super register; this is not specified - // in the ARM Architecture Manual as far as I understand it (A8.6.307). - // Therefore, we morph the RegClassID to be the sub register class and don't - // subsequently transform the RawRegister encoding when calculating RegNum. - // - // See also ARMinstPrinter::printOperand() wrt "dregpair" modifier part - // where this workaround is meant for. - RegClassID = ARM::DPRRegClassID; - } - +static unsigned +getRegisterEnum(BO B, unsigned RegClassID, unsigned RawRegister) { // For this purpose, we can treat rGPR as if it were GPR. if (RegClassID == ARM::rGPRRegClassID) RegClassID = ARM::GPRRegClassID; @@ -704,8 +691,8 @@ static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn, // MSR/MSRsys: Rm mask=Inst{19-16} // BXJ: Rm // MSRi/MSRsysi: so_imm -// SRSW/SRS: addrmode4:$addr mode_imm -// RFEW/RFE: addrmode4:$addr Rn +// SRSW/SRS: ldstm_mode:$amode mode_imm +// RFEW/RFE: ldstm_mode:$amode Rn static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -733,35 +720,34 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, NumOpsAdded = 1; return true; } - // MSR and MSRsys take one GPR reg Rm, followed by the mask. - if (Opcode == ARM::MSR || Opcode == ARM::MSRsys) { - assert(NumOps >= 1 && OpInfo[0].RegClass == ARM::GPRRegClassID && + // MSR take a mask, followed by one GPR reg Rm. The mask contains the R Bit in + // bit 4, and the special register fields in bits 3-0. + if (Opcode == ARM::MSR) { + assert(NumOps >= 1 && OpInfo[1].RegClass == ARM::GPRRegClassID && "Reg operand expected"); + MI.addOperand(MCOperand::CreateImm(slice(insn, 22, 22) << 4 /* R Bit */ | + slice(insn, 19, 16) /* Special Reg */ )); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); - MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 16))); NumOpsAdded = 2; return true; } - // MSRi and MSRsysi take one so_imm operand, followed by the mask. - if (Opcode == ARM::MSRi || Opcode == ARM::MSRsysi) { + // MSRi take a mask, followed by one so_imm operand. The mask contains the + // R Bit in bit 4, and the special register fields in bits 3-0. + if (Opcode == ARM::MSRi) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 22, 22) << 4 /* R Bit */ | + slice(insn, 19, 16) /* Special Reg */ )); // SOImm is 4-bit rotate amount in bits 11-8 with 8-bit imm in bits 7-0. // A5.2.4 Rotate amount is twice the numeric value of Inst{11-8}. // See also ARMAddressingModes.h: getSOImmValImm() and getSOImmValRot(). unsigned Rot = (insn >> ARMII::SoRotImmShift) & 0xF; unsigned Imm = insn & 0xFF; MI.addOperand(MCOperand::CreateImm(ARM_AM::rotr32(Imm, 2*Rot))); - MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 16))); NumOpsAdded = 2; return true; } - // SRSW and SRS requires addrmode4:$addr for ${addr:submode}, followed by the - // mode immediate (Inst{4-0}). if (Opcode == ARM::SRSW || Opcode == ARM::SRS || Opcode == ARM::RFEW || Opcode == ARM::RFE) { - // ARMInstPrinter::printAddrMode4Operand() prints special mode string - // if the base register is SP; so don't set ARM::SP. - MI.addOperand(MCOperand::CreateReg(0)); ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn)); MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode))); @@ -807,9 +793,8 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, } // Misc. Branch Instructions. -// BR_JTadd, BR_JTr, BR_JTm // BLXr9, BXr9 -// BRIND, BX_RET +// BX, BX_RET static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -820,12 +805,12 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, OpIdx = 0; - // BX_RET has only two predicate operands, do an early return. - if (Opcode == ARM::BX_RET) + // BX_RET and MOVPCLR have only two predicate operands; do an early return. + if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR) return true; - // BLXr9 and BRIND take one GPR reg. - if (Opcode == ARM::BLXr9 || Opcode == ARM::BRIND) { + // BLXr9 and BX take one GPR reg. + if (Opcode == ARM::BLXr9 || Opcode == ARM::BX) { assert(NumOps >= 1 && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && "Reg operand expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -834,72 +819,6 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } - // BR_JTadd is an ADD with Rd = PC, (Rn, Rm) as the target and index regs. - if (Opcode == ARM::BR_JTadd) { - // InOperandList with GPR:$target and GPR:$idx regs. - - assert(NumOps == 4 && "Expect 4 operands"); - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRn(insn)))); - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRm(insn)))); - - // Fill in the two remaining imm operands to signify build completion. - MI.addOperand(MCOperand::CreateImm(0)); - MI.addOperand(MCOperand::CreateImm(0)); - - OpIdx = 4; - return true; - } - - // BR_JTr is a MOV with Rd = PC, and Rm as the source register. - if (Opcode == ARM::BR_JTr) { - // InOperandList with GPR::$target reg. - - assert(NumOps == 3 && "Expect 3 operands"); - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRm(insn)))); - - // Fill in the two remaining imm operands to signify build completion. - MI.addOperand(MCOperand::CreateImm(0)); - MI.addOperand(MCOperand::CreateImm(0)); - - OpIdx = 3; - return true; - } - - // BR_JTm is an LDR with Rt = PC. - if (Opcode == ARM::BR_JTm) { - // This is the reg/reg form, with base reg followed by +/- reg shop imm. - // See also ARMAddressingModes.h (Addressing Mode #2). - - assert(NumOps == 5 && getIBit(insn) == 1 && "Expect 5 operands && I-bit=1"); - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRn(insn)))); - - ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; - - // Disassemble the offset reg (Rm), shift type, and immediate shift length. - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRm(insn)))); - // Inst{6-5} encodes the shift opcode. - ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5)); - // Inst{11-7} encodes the imm5 shift amount. - unsigned ShImm = slice(insn, 11, 7); - - // A8.4.1. Possible rrx or shift amount of 32... - getImmShiftSE(ShOp, ShImm); - MI.addOperand(MCOperand::CreateImm( - ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp))); - - // Fill in the two remaining imm operands to signify build completion. - MI.addOperand(MCOperand::CreateImm(0)); - MI.addOperand(MCOperand::CreateImm(0)); - - OpIdx = 5; - return true; - } - return false; } @@ -1324,30 +1243,28 @@ static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { assert(NumOps >= 5 && "LdStMulFrm expects NumOps >= 5"); - - unsigned &OpIdx = NumOpsAdded; - - OpIdx = 0; + NumOpsAdded = 0; unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)); // Writeback to base, if necessary. - if (Opcode == ARM::LDM_UPD || Opcode == ARM::STM_UPD) { + if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::STMIA_UPD || + Opcode == ARM::LDMDA_UPD || Opcode == ARM::STMDA_UPD || + Opcode == ARM::LDMDB_UPD || Opcode == ARM::STMDB_UPD || + Opcode == ARM::LDMIB_UPD || Opcode == ARM::STMIB_UPD) { MI.addOperand(MCOperand::CreateReg(Base)); - ++OpIdx; + ++NumOpsAdded; } + // Add the base register operand. MI.addOperand(MCOperand::CreateReg(Base)); - ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn)); - MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode))); - // Handling the two predicate operands before the reglist. int64_t CondVal = insn >> ARMII::CondShift; MI.addOperand(MCOperand::CreateImm(CondVal == 0xF ? 0xE : CondVal)); MI.addOperand(MCOperand::CreateReg(ARM::CPSR)); - OpIdx += 4; + NumOpsAdded += 3; // Fill the variadic part of reglist. unsigned RegListBits = insn & ((1 << 16) - 1); @@ -1355,7 +1272,7 @@ static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, if ((RegListBits >> i) & 1) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, i))); - ++OpIdx; + ++NumOpsAdded; } } @@ -1586,8 +1503,7 @@ static unsigned decodeVFPRm(uint32_t insn, bool isSPVFP) { } // A7.5.1 -#if 0 -static uint64_t VFPExpandImm(unsigned char byte, unsigned N) { +static APInt VFPExpandImm(unsigned char byte, unsigned N) { assert(N == 32 || N == 64); uint64_t Result; @@ -1602,13 +1518,12 @@ static uint64_t VFPExpandImm(unsigned char byte, unsigned N) { Result = (uint64_t)slice(byte, 7, 7) << 63 | (uint64_t)slice(byte, 5, 0) << 48; if (bit6) - Result |= 0xffL << 54; + Result |= 0xffULL << 54; else - Result |= 0x1L << 62; + Result |= 0x1ULL << 62; } - return Result; + return APInt(N, Result); } -#endif // VFP Unary Format Instructions: // @@ -1902,8 +1817,10 @@ static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)); // Writeback to base, if necessary. - if (Opcode == ARM::VLDMD_UPD || Opcode == ARM::VLDMS_UPD || - Opcode == ARM::VSTMD_UPD || Opcode == ARM::VSTMS_UPD) { + if (Opcode == ARM::VLDMDIA_UPD || Opcode == ARM::VLDMSIA_UPD || + Opcode == ARM::VLDMDDB_UPD || Opcode == ARM::VLDMSDB_UPD || + Opcode == ARM::VSTMDIA_UPD || Opcode == ARM::VSTMSIA_UPD || + Opcode == ARM::VSTMDDB_UPD || Opcode == ARM::VSTMSDB_UPD) { MI.addOperand(MCOperand::CreateReg(Base)); ++OpIdx; } @@ -1926,8 +1843,10 @@ static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, OpIdx += 4; - bool isSPVFP = (Opcode == ARM::VLDMS || Opcode == ARM::VLDMS_UPD || - Opcode == ARM::VSTMS || Opcode == ARM::VSTMS_UPD); + bool isSPVFP = (Opcode == ARM::VLDMSIA || Opcode == ARM::VLDMSDB || + Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMSDB_UPD || + Opcode == ARM::VSTMSIA || Opcode == ARM::VSTMSDB || + Opcode == ARM::VSTMSIA_UPD || Opcode == ARM::VSTMSDB_UPD); unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID; // Extract Dd/Sd. @@ -1985,10 +1904,14 @@ static bool DisassembleVFPMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Extract/decode the f64/f32 immediate. if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { - // The asm syntax specifies the before-expanded <imm>. - // Not VFPExpandImm(slice(insn,19,16) << 4 | slice(insn, 3, 0), - // Opcode == ARM::FCONSTD ? 64 : 32) - MI.addOperand(MCOperand::CreateImm(slice(insn,19,16)<<4 | slice(insn,3,0))); + // The asm syntax specifies the floating point value, not the 8-bit literal. + APInt immRaw = VFPExpandImm(slice(insn,19,16) << 4 | slice(insn, 3, 0), + Opcode == ARM::FCONSTD ? 64 : 32); + APFloat immFP = APFloat(immRaw, true); + double imm = Opcode == ARM::FCONSTD ? immFP.convertToDouble() : + immFP.convertToFloat(); + MI.addOperand(MCOperand::CreateFPImm(imm)); + ++OpIdx; } @@ -2201,22 +2124,6 @@ static unsigned decodeN3VImm(uint32_t insn) { return (insn >> 8) & 0xF; } -static bool UseDRegPair(unsigned Opcode) { - switch (Opcode) { - default: - return false; - case ARM::VLD1q8_UPD: - case ARM::VLD1q16_UPD: - case ARM::VLD1q32_UPD: - case ARM::VLD1q64_UPD: - case ARM::VST1q8_UPD: - case ARM::VST1q16_UPD: - case ARM::VST1q32_UPD: - case ARM::VST1q64_UPD: - return true; - } -} - // VLD* // D[d] D[d2] ... Rn [TIED_TO Rn] align [Rm] // VLD*LN* @@ -2243,10 +2150,9 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, // We have homogeneous NEON registers for Load/Store. unsigned RegClass = 0; - bool DRegPair = UseDRegPair(Opcode); // Double-spaced registers have increments of 2. - unsigned Inc = (DblSpaced || DRegPair) ? 2 : 1; + unsigned Inc = DblSpaced ? 2 : 1; unsigned Rn = decodeRn(insn); unsigned Rm = decodeRm(insn); @@ -2292,7 +2198,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, RegClass = OpInfo[OpIdx].RegClass; while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { MI.addOperand(MCOperand::CreateReg( - getRegisterEnum(B, RegClass, Rd, DRegPair))); + getRegisterEnum(B, RegClass, Rd))); Rd += Inc; ++OpIdx; } @@ -2311,7 +2217,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { MI.addOperand(MCOperand::CreateReg( - getRegisterEnum(B, RegClass, Rd, DRegPair))); + getRegisterEnum(B, RegClass, Rd))); Rd += Inc; ++OpIdx; } @@ -2771,8 +2677,8 @@ static bool DisassembleN3RegVecShFrm(MCInst &MI, unsigned Opcode, return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded, N3V_VectorShift, B); } -static bool DisassembleNVecExtractFrm(MCInst &MI, unsigned Opcode, uint32_t insn, - unsigned short NumOps, unsigned &NumOpsAdded, BO B) { +static bool DisassembleNVecExtractFrm(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded, N3V_VectorExtract, B); @@ -2959,9 +2865,9 @@ static inline bool MemBarrierInstr(uint32_t insn) { static inline bool PreLoadOpcode(unsigned Opcode) { switch(Opcode) { - case ARM::PLDi: case ARM::PLDr: - case ARM::PLDWi: case ARM::PLDWr: - case ARM::PLIi: case ARM::PLIr: + case ARM::PLDi12: case ARM::PLDrs: + case ARM::PLDWi12: case ARM::PLDWrs: + case ARM::PLIi12: case ARM::PLIrs: return true; default: return false; @@ -2971,18 +2877,21 @@ static inline bool PreLoadOpcode(unsigned Opcode) { static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - // Preload Data/Instruction requires either 2 or 4 operands. - // PLDi, PLDWi, PLIi: Rn [+/-]imm12 add = (U == '1') - // PLDr[a|m], PLDWr[a|m], PLIr[a|m]: Rn Rm addrmode2_opc + // Preload Data/Instruction requires either 2 or 3 operands. + // PLDi, PLDWi, PLIi: addrmode_imm12 + // PLDr[a|m], PLDWr[a|m], PLIr[a|m]: ldst_so_reg MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)))); - if (Opcode == ARM::PLDi || Opcode == ARM::PLDWi || Opcode == ARM::PLIi) { + if (Opcode == ARM::PLDi12 || Opcode == ARM::PLDWi12 + || Opcode == ARM::PLIi12) { unsigned Imm12 = slice(insn, 11, 0); bool Negative = getUBit(insn) == 0; - int Offset = Negative ? -1 - Imm12 : 1 * Imm12; - MI.addOperand(MCOperand::CreateImm(Offset)); + // -0 is represented specially. All other values are as normal. + if (Imm12 == 0 && Negative) + Imm12 = INT32_MIN; + MI.addOperand(MCOperand::CreateImm(Imm12)); NumOpsAdded = 2; } else { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -3026,22 +2935,36 @@ static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, case ARM::WFE: case ARM::WFI: case ARM::SEV: - case ARM::SETENDBE: - case ARM::SETENDLE: return true; default: break; } - // CPS has a singleton $opt operand that contains the following information: - // opt{4-0} = mode from Inst{4-0} - // opt{5} = changemode from Inst{17} - // opt{8-6} = AIF from Inst{8-6} - // opt{10-9} = imod from Inst{19-18} with 0b10 as enable and 0b11 as disable - if (Opcode == ARM::CPS) { - unsigned Option = slice(insn, 4, 0) | slice(insn, 17, 17) << 5 | - slice(insn, 8, 6) << 6 | slice(insn, 19, 18) << 9; - MI.addOperand(MCOperand::CreateImm(Option)); + if (Opcode == ARM::SETEND) { + NumOpsAdded = 1; + MI.addOperand(MCOperand::CreateImm(slice(insn, 9, 9))); + return true; + } + + // FIXME: To enable correct asm parsing and disasm of CPS we need 3 different + // opcodes which match the same real instruction. This is needed since there's + // no current handling of optional arguments. Fix here when a better handling + // of optional arguments is implemented. + if (Opcode == ARM::CPS3p) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod + MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6))); // iflags + MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode + NumOpsAdded = 3; + return true; + } + if (Opcode == ARM::CPS2p) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod + MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6))); // iflags + NumOpsAdded = 2; + return true; + } + if (Opcode == ARM::CPS1p) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode NumOpsAdded = 1; return true; } diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h index 112817b..23372e0 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h @@ -564,6 +564,38 @@ static bool DisassembleThumb1LdPC(MCInst &MI, unsigned Opcode, uint32_t insn, // t_addrmode_sp := sp + imm8 * 4 // +// A8.6.63 LDRB (literal) +// A8.6.79 LDRSB (literal) +// A8.6.75 LDRH (literal) +// A8.6.83 LDRSH (literal) +// A8.6.59 LDR (literal) +// +// These instrs calculate an address from the PC value and an immediate offset. +// Rd Rn=PC (+/-)imm12 (+ if Inst{23} == 0b1) +static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode, + uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + if (!OpInfo) return false; + + assert(NumOps >= 2 && + OpInfo[0].RegClass == ARM::GPRRegClassID && + OpInfo[1].RegClass < 0 && + "Expect >= 2 operands, first as reg, and second as imm operand"); + + // Build the register operand, followed by the (+/-)imm12 immediate. + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + + MI.addOperand(MCOperand::CreateImm(decodeImm12(insn))); + + NumOpsAdded = 2; + + return true; +} + + // A6.2.4 Load/store single data item // // Load/Store Register (reg|imm): tRd tRn imm5 tRm @@ -796,14 +828,13 @@ static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn, } // CPS has a singleton $opt operand that contains the following information: - // opt{4-0} = don't care - // opt{5} = 0 (false) - // opt{8-6} = AIF from Inst{2-0} - // opt{10-9} = 1:imod from Inst{4} with 0b10 as enable and 0b11 as disable + // The first op would be 0b10 as enable and 0b11 as disable in regular ARM, + // but in Thumb it's is 0 as enable and 1 as disable. So map it to ARM's + // default one. The second get the AIF flags from Inst{2-0}. if (Opcode == ARM::tCPS) { - unsigned Option = slice(insn, 2, 0) << 6 | slice(insn, 4, 4) << 9 | 1 << 10; - MI.addOperand(MCOperand::CreateImm(Option)); - NumOpsAdded = 1; + MI.addOperand(MCOperand::CreateImm(2 + slice(insn, 4, 4))); + MI.addOperand(MCOperand::CreateImm(slice(insn, 2, 0))); + NumOpsAdded = 2; return true; } @@ -833,40 +864,32 @@ static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn, // A8.6.53 LDM / LDMIA // A8.6.189 STM / STMIA // -// tLDM_UPD/tSTM_UPD: tRt tRt AM4ModeImm Pred-Imm Pred-CCR register_list -// tLDM: tRt AM4ModeImm Pred-Imm Pred-CCR register_list +// tLDMIA_UPD/tSTMIA_UPD: tRt tRt AM4ModeImm Pred-Imm Pred-CCR register_list +// tLDMIA: tRt AM4ModeImm Pred-Imm Pred-CCR register_list static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode, - uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - - assert((Opcode == ARM::tLDM || Opcode == ARM::tLDM_UPD || - Opcode == ARM::tSTM_UPD) && "Unexpected opcode"); - - unsigned &OpIdx = NumOpsAdded; + uint32_t insn, unsigned short NumOps, + unsigned &NumOpsAdded, BO B) { + assert((Opcode == ARM::tLDMIA || Opcode == ARM::tLDMIA_UPD || + Opcode == ARM::tSTMIA_UPD) && "Unexpected opcode"); unsigned tRt = getT1tRt(insn); - - OpIdx = 0; + NumOpsAdded = 0; // WB register, if necessary. - if (Opcode == ARM::tLDM_UPD || Opcode == ARM::tSTM_UPD) { + if (Opcode == ARM::tLDMIA_UPD || Opcode == ARM::tSTMIA_UPD) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, tRt))); - ++OpIdx; + ++NumOpsAdded; } MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, tRt))); - ++OpIdx; - - // A8.6.53 LDM / LDMIA / LDMFD - Encoding T1 - // A8.6.53 STM / STMIA / STMEA - Encoding T1 - MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))); - ++OpIdx; + ++NumOpsAdded; // Handling the two predicate operands before the reglist. - if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) - OpIdx += 2; - else { + if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) { + NumOpsAdded += 2; + } else { DEBUG(errs() << "Expected predicate operands not found.\n"); return false; } @@ -874,13 +897,12 @@ static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode, unsigned RegListBits = slice(insn, 7, 0); // Fill the variadic part of reglist. - for (unsigned i = 0; i < 8; ++i) { + for (unsigned i = 0; i < 8; ++i) if ((RegListBits >> i) & 1) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, i))); - ++OpIdx; + ++NumOpsAdded; } - } return true; } @@ -959,22 +981,23 @@ static bool DisassembleThumb1Br(MCInst &MI, unsigned Opcode, uint32_t insn, // corresponding to op. // // Table A6-1 16-bit Thumb instruction encoding (abridged) -// op Instruction or instruction class -// ------ -------------------------------------------------------------------- -// 00xxxx Shift (immediate), add, subtract, move, and compare on page A6-7 -// 010000 Data-processing on page A6-8 -// 010001 Special data instructions and branch and exchange on page A6-9 -// 01001x Load from Literal Pool, see LDR (literal) on page A8-122 -// 0101xx Load/store single data item on page A6-10 +// op Instruction or instruction class +// ------ -------------------------------------------------------------------- +// 00xxxx Shift (immediate), add, subtract, move, and compare on page A6-7 +// 010000 Data-processing on page A6-8 +// 010001 Special data instructions and branch and exchange on page A6-9 +// 01001x Load from Literal Pool, see LDR (literal) on page A8-122 +// 0101xx Load/store single data item on page A6-10 // 011xxx // 100xxx -// 10100x Generate PC-relative address, see ADR on page A8-32 -// 10101x Generate SP-relative address, see ADD (SP plus immediate) on page A8-28 -// 1011xx Miscellaneous 16-bit instructions on page A6-11 -// 11000x Store multiple registers, see STM / STMIA / STMEA on page A8-374 -// 11001x Load multiple registers, see LDM / LDMIA / LDMFD on page A8-110 a -// 1101xx Conditional branch, and Supervisor Call on page A6-13 -// 11100x Unconditional Branch, see B on page A8-44 +// 10100x Generate PC-relative address, see ADR on page A8-32 +// 10101x Generate SP-relative address, see ADD (SP plus immediate) on +// page A8-28 +// 1011xx Miscellaneous 16-bit instructions on page A6-11 +// 11000x Store multiple registers, see STM / STMIA / STMEA on page A8-374 +// 11001x Load multiple registers, see LDM / LDMIA / LDMFD on page A8-110 a +// 1101xx Conditional branch, and Supervisor Call on page A6-13 +// 11100x Unconditional Branch, see B on page A8-44 // static bool DisassembleThumb1(uint16_t op, MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -1121,34 +1144,31 @@ static bool DisassembleThumb2LdStMul(MCInst &MI, unsigned Opcode, uint32_t insn, if (Thumb2RFEOpcode(Opcode)) return DisassembleThumb2RFE(MI, Opcode, insn, NumOps, NumOpsAdded, B); - assert((Opcode == ARM::t2LDM || Opcode == ARM::t2LDM_UPD || - Opcode == ARM::t2STM || Opcode == ARM::t2STM_UPD) + assert((Opcode == ARM::t2LDMIA || Opcode == ARM::t2LDMIA_UPD || + Opcode == ARM::t2LDMDB || Opcode == ARM::t2LDMDB_UPD || + Opcode == ARM::t2STMIA || Opcode == ARM::t2STMIA_UPD || + Opcode == ARM::t2STMDB || Opcode == ARM::t2STMDB_UPD) && "Unexpected opcode"); assert(NumOps >= 5 && "Thumb2 LdStMul expects NumOps >= 5"); - unsigned &OpIdx = NumOpsAdded; - - OpIdx = 0; + NumOpsAdded = 0; unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)); // Writeback to base. - if (Opcode == ARM::t2LDM_UPD || Opcode == ARM::t2STM_UPD) { + if (Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD || + Opcode == ARM::t2STMIA_UPD || Opcode == ARM::t2STMDB_UPD) { MI.addOperand(MCOperand::CreateReg(Base)); - ++OpIdx; + ++NumOpsAdded; } MI.addOperand(MCOperand::CreateReg(Base)); - ++OpIdx; - - ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn)); - MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode))); - ++OpIdx; + ++NumOpsAdded; // Handling the two predicate operands before the reglist. - if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) - OpIdx += 2; - else { + if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) { + NumOpsAdded += 2; + } else { DEBUG(errs() << "Expected predicate operands not found.\n"); return false; } @@ -1156,13 +1176,12 @@ static bool DisassembleThumb2LdStMul(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned RegListBits = insn & ((1 << 16) - 1); // Fill the variadic part of reglist. - for (unsigned i = 0; i < 16; ++i) { + for (unsigned i = 0; i < 16; ++i) if ((RegListBits >> i) & 1) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, i))); - ++OpIdx; + ++NumOpsAdded; } - } return true; } @@ -1260,13 +1279,7 @@ static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode, return true; } -// PC-based defined for Codegen, which do not get decoded by design: -// -// t2TBB, t2TBH: Rm immDontCare immDontCare -// -// Generic version defined for disassembly: -// -// t2TBBgen, t2TBHgen: Rn Rm Pred-Imm Pred-CCR +// t2TBB, t2TBH: Rn Rm Pred-Imm Pred-CCR static bool DisassembleThumb2TB(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -1401,7 +1414,8 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, // // Two register operands: Rs Rn ModImm // One register operands (Rs=0b1111 no explicit dest reg): Rn ModImm -// One register operands (Rn=0b1111 no explicit src reg): Rs ModImm - {t2MOVi, t2MVNi} +// One register operands (Rn=0b1111 no explicit src reg): Rs ModImm - +// {t2MOVi, t2MVNi} // // ModImm = ThumbExpandImm(i:imm3:imm8) static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, @@ -1644,15 +1658,25 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode, break; } - // CPS has a singleton $opt operand that contains the following information: - // opt{4-0} = mode from Inst{4-0} - // opt{5} = changemode from Inst{8} - // opt{8-6} = AIF from Inst{7-5} - // opt{10-9} = imod from Inst{10-9} with 0b10 as enable and 0b11 as disable - if (Opcode == ARM::t2CPS) { - unsigned Option = slice(insn, 4, 0) | slice(insn, 8, 8) << 5 | - slice(insn, 7, 5) << 6 | slice(insn, 10, 9) << 9; - MI.addOperand(MCOperand::CreateImm(Option)); + // FIXME: To enable correct asm parsing and disasm of CPS we need 3 different + // opcodes which match the same real instruction. This is needed since there's + // no current handling of optional arguments. Fix here when a better handling + // of optional arguments is implemented. + if (Opcode == ARM::t2CPS3p) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 10, 9))); // imod + MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 5))); // iflags + MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode + NumOpsAdded = 3; + return true; + } + if (Opcode == ARM::t2CPS2p) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 10, 9))); // imod + MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 5))); // iflags + NumOpsAdded = 2; + return true; + } + if (Opcode == ARM::t2CPS1p) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode NumOpsAdded = 1; return true; } @@ -1678,11 +1702,13 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode, NumOpsAdded = 1; return true; } - // MSR and MSRsys take one GPR reg Rn, followed by the mask. - if (Opcode == ARM::t2MSR || Opcode == ARM::t2MSRsys || Opcode == ARM::t2BXJ) { + // MSR take a mask, followed by one GPR reg Rn. The mask contains the R Bit in + // bit 4, and the special register fields in bits 3-0. + if (Opcode == ARM::t2MSR) { + MI.addOperand(MCOperand::CreateImm(slice(insn, 20, 20) << 4 /* R Bit */ | + slice(insn, 11, 8) /* Special Reg */)); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)))); - MI.addOperand(MCOperand::CreateImm(slice(insn, 11, 8))); NumOpsAdded = 2; return true; } @@ -1728,12 +1754,12 @@ static inline bool Thumb2PreloadOpcode(unsigned Opcode) { switch (Opcode) { default: return false; - case ARM::t2PLDi12: case ARM::t2PLDi8: case ARM::t2PLDpci: - case ARM::t2PLDr: case ARM::t2PLDs: - case ARM::t2PLDWi12: case ARM::t2PLDWi8: case ARM::t2PLDWpci: - case ARM::t2PLDWr: case ARM::t2PLDWs: - case ARM::t2PLIi12: case ARM::t2PLIi8: case ARM::t2PLIpci: - case ARM::t2PLIr: case ARM::t2PLIs: + case ARM::t2PLDi12: case ARM::t2PLDi8: + case ARM::t2PLDs: + case ARM::t2PLDWi12: case ARM::t2PLDWi8: + case ARM::t2PLDWs: + case ARM::t2PLIi12: case ARM::t2PLIi8: + case ARM::t2PLIs: return true; } } @@ -1769,11 +1795,10 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn, && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); int Offset = 0; - if (Opcode == ARM::t2PLDpci || Opcode == ARM::t2PLDWpci || - Opcode == ARM::t2PLIpci) { + if (slice(insn, 19, 16) == 0xFF) { bool Negative = slice(insn, 23, 23) == 0; unsigned Imm12 = getImm12(insn); - Offset = Negative ? -1 - Imm12 : 1 * Imm12; + Offset = Negative ? -1 - Imm12 : 1 * Imm12; } else if (Opcode == ARM::t2PLDi8 || Opcode == ARM::t2PLDWi8 || Opcode == ARM::t2PLIi8) { // A8.6.117 Encoding T2: add = FALSE @@ -1795,37 +1820,6 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } -// A8.6.63 LDRB (literal) -// A8.6.79 LDRSB (literal) -// A8.6.75 LDRH (literal) -// A8.6.83 LDRSH (literal) -// A8.6.59 LDR (literal) -// -// These instrs calculate an address from the PC value and an immediate offset. -// Rd Rn=PC (+/-)imm12 (+ if Inst{23} == 0b1) -static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode, - uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - - const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; - if (!OpInfo) return false; - - assert(NumOps >= 2 && - OpInfo[0].RegClass == ARM::GPRRegClassID && - OpInfo[1].RegClass < 0 && - "Expect >= 2 operands, first as reg, and second as imm operand"); - - // Build the register operand, followed by the (+/-)imm12 immediate. - - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRd(insn)))); - - MI.addOperand(MCOperand::CreateImm(decodeImm12(insn))); - - NumOpsAdded = 2; - - return true; -} - // A6.3.10 Store single data item // A6.3.9 Load byte, memory hints // A6.3.8 Load halfword, memory hints @@ -1835,13 +1829,15 @@ static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode, // // t2LDRi12: Rd Rn (+)imm12 // t2LDRi8: Rd Rn (+/-)imm8 (+ if Inst{9} == 0b1) -// t2LDRs: Rd Rn Rm ConstantShiftSpecifier (see also DisassembleThumb2DPSoReg) +// t2LDRs: Rd Rn Rm ConstantShiftSpecifier (see also +// DisassembleThumb2DPSoReg) // t2LDR_POST: Rd Rn Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1) // t2LDR_PRE: Rd Rn Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1) // // t2STRi12: Rd Rn (+)imm12 // t2STRi8: Rd Rn (+/-)imm8 (+ if Inst{9} == 0b1) -// t2STRs: Rd Rn Rm ConstantShiftSpecifier (see also DisassembleThumb2DPSoReg) +// t2STRs: Rd Rn Rm ConstantShiftSpecifier (see also +// DisassembleThumb2DPSoReg) // t2STR_POST: Rn Rd Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1) // t2STR_PRE: Rn Rd Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1) // @@ -1862,7 +1858,6 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, // See, for example, A6.3.7 Load word: Table A6-18 Load word. if (Load && Rn == 15) return DisassembleThumb2Ldpci(MI, Opcode, insn, NumOps, NumOpsAdded, B); - const TargetInstrDesc &TID = ARMInsts[Opcode]; const TargetOperandInfo *OpInfo = TID.OpInfo; unsigned &OpIdx = NumOpsAdded; @@ -1909,7 +1904,7 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, else Imm = decodeImm8(insn); } - + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, R0))); ++OpIdx; @@ -2081,25 +2076,29 @@ static bool DisassembleThumb2LongMul(MCInst &MI, unsigned Opcode, uint32_t insn, // corresponding to (op1, op2, op). // // Table A6-9 32-bit Thumb instruction encoding -// op1 op2 op Instruction class, see -// --- ------- -- ------------------------------------------------------------ -// 01 00xx0xx - Load/store multiple on page A6-23 -// 00xx1xx - Load/store dual, load/store exclusive, table branch on page A6-24 -// 01xxxxx - Data-processing (shifted register) on page A6-31 -// 1xxxxxx - Coprocessor instructions on page A6-40 -// 10 x0xxxxx 0 Data-processing (modified immediate) on page A6-15 -// x1xxxxx 0 Data-processing (plain binary immediate) on page A6-19 -// - 1 Branches and miscellaneous control on page A6-20 -// 11 000xxx0 - Store single data item on page A6-30 -// 001xxx0 - Advanced SIMD element or structure load/store instructions on page A7-27 -// 00xx001 - Load byte, memory hints on page A6-28 -// 00xx011 - Load halfword, memory hints on page A6-26 -// 00xx101 - Load word on page A6-25 -// 00xx111 - UNDEFINED -// 010xxxx - Data-processing (register) on page A6-33 -// 0110xxx - Multiply, multiply accumulate, and absolute difference on page A6-38 -// 0111xxx - Long multiply, long multiply accumulate, and divide on page A6-39 -// 1xxxxxx - Coprocessor instructions on page A6-40 +// op1 op2 op Instruction class, see +// --- ------- -- ----------------------------------------------------------- +// 01 00xx0xx - Load/store multiple on page A6-23 +// 00xx1xx - Load/store dual, load/store exclusive, table branch on +// page A6-24 +// 01xxxxx - Data-processing (shifted register) on page A6-31 +// 1xxxxxx - Coprocessor instructions on page A6-40 +// 10 x0xxxxx 0 Data-processing (modified immediate) on page A6-15 +// x1xxxxx 0 Data-processing (plain binary immediate) on page A6-19 +// - 1 Branches and miscellaneous control on page A6-20 +// 11 000xxx0 - Store single data item on page A6-30 +// 001xxx0 - Advanced SIMD element or structure load/store instructions +// on page A7-27 +// 00xx001 - Load byte, memory hints on page A6-28 +// 00xx011 - Load halfword, memory hints on page A6-26 +// 00xx101 - Load word on page A6-25 +// 00xx111 - UNDEFINED +// 010xxxx - Data-processing (register) on page A6-33 +// 0110xxx - Multiply, multiply accumulate, and absolute difference on +// page A6-38 +// 0111xxx - Long multiply, long multiply accumulate, and divide on +// page A6-39 +// 1xxxxxx - Coprocessor instructions on page A6-40 // static bool DisassembleThumb2(uint16_t op1, uint16_t op2, uint16_t op, MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, @@ -2130,7 +2129,7 @@ static bool DisassembleThumb2(uint16_t op1, uint16_t op2, uint16_t op, return DisassembleThumb2LdStDual(MI, Opcode, insn, NumOps, NumOpsAdded, B); } - if (Opcode == ARM::t2TBBgen || Opcode == ARM::t2TBHgen) { + if (Opcode == ARM::t2TBB || Opcode == ARM::t2TBH) { // Table branch. return DisassembleThumb2TB(MI, Opcode, insn, NumOps, NumOpsAdded, B); } @@ -2175,7 +2174,8 @@ static bool DisassembleThumb2(uint16_t op1, uint16_t op2, uint16_t op, } } else { // Table A6-9 32-bit Thumb instruction encoding: Load byte|halfword|word - return DisassembleThumb2LdSt(true, MI,Opcode,insn,NumOps,NumOpsAdded, B); + return DisassembleThumb2LdSt(true, MI, Opcode, insn, NumOps, + NumOpsAdded, B); } break; case 1: @@ -2229,7 +2229,7 @@ static bool DisassembleThumbFrm(MCInst &MI, unsigned Opcode, uint32_t insn, } // A6.3 32-bit Thumb instruction encoding - + uint16_t op1 = slice(HalfWord, 12, 11); uint16_t op2 = slice(HalfWord, 10, 4); uint16_t op = slice(insn, 15, 15); diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 8026e77..1499da0 100644 --- a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "asm-printer" -#include "ARM.h" // FIXME: FACTOR ENUMS BETTER. +#include "ARMBaseInfo.h" #include "ARMInstPrinter.h" #include "ARMAddressingModes.h" #include "llvm/MC/MCInst.h" @@ -22,86 +22,20 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -// Include the auto-generated portion of the assembly writer. -#define MachineInstr MCInst -#define ARMAsmPrinter ARMInstPrinter // FIXME: REMOVE. +#define GET_INSTRUCTION_NAME #include "ARMGenAsmWriter.inc" -#undef MachineInstr -#undef ARMAsmPrinter -static unsigned NextReg(unsigned Reg) { - switch (Reg) { - default: - assert(0 && "Unexpected register enum"); - - case ARM::D0: - return ARM::D1; - case ARM::D1: - return ARM::D2; - case ARM::D2: - return ARM::D3; - case ARM::D3: - return ARM::D4; - case ARM::D4: - return ARM::D5; - case ARM::D5: - return ARM::D6; - case ARM::D6: - return ARM::D7; - case ARM::D7: - return ARM::D8; - case ARM::D8: - return ARM::D9; - case ARM::D9: - return ARM::D10; - case ARM::D10: - return ARM::D11; - case ARM::D11: - return ARM::D12; - case ARM::D12: - return ARM::D13; - case ARM::D13: - return ARM::D14; - case ARM::D14: - return ARM::D15; - case ARM::D15: - return ARM::D16; - case ARM::D16: - return ARM::D17; - case ARM::D17: - return ARM::D18; - case ARM::D18: - return ARM::D19; - case ARM::D19: - return ARM::D20; - case ARM::D20: - return ARM::D21; - case ARM::D21: - return ARM::D22; - case ARM::D22: - return ARM::D23; - case ARM::D23: - return ARM::D24; - case ARM::D24: - return ARM::D25; - case ARM::D25: - return ARM::D26; - case ARM::D26: - return ARM::D27; - case ARM::D27: - return ARM::D28; - case ARM::D28: - return ARM::D29; - case ARM::D29: - return ARM::D30; - case ARM::D30: - return ARM::D31; - } +StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const { + return getInstructionName(Opcode); } + void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) { + unsigned Opcode = MI->getOpcode(); + // Check for MOVs and print canonical forms, instead. - if (MI->getOpcode() == ARM::MOVs) { + if (Opcode == ARM::MOVs) { + // FIXME: Thumb variants? const MCOperand &Dst = MI->getOperand(0); const MCOperand &MO1 = MI->getOperand(1); const MCOperand &MO2 = MI->getOperand(2); @@ -129,118 +63,82 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) { } // A8.6.123 PUSH - if ((MI->getOpcode() == ARM::STM_UPD || MI->getOpcode() == ARM::t2STM_UPD) && + if ((Opcode == ARM::STMDB_UPD || Opcode == ARM::t2STMDB_UPD) && MI->getOperand(0).getReg() == ARM::SP) { - const MCOperand &MO1 = MI->getOperand(2); - if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::db) { - O << '\t' << "push"; - printPredicateOperand(MI, 3, O); - O << '\t'; - printRegisterList(MI, 5, O); - return; - } + O << '\t' << "push"; + printPredicateOperand(MI, 2, O); + if (Opcode == ARM::t2STMDB_UPD) + O << ".w"; + O << '\t'; + printRegisterList(MI, 4, O); + return; } // A8.6.122 POP - if ((MI->getOpcode() == ARM::LDM_UPD || MI->getOpcode() == ARM::t2LDM_UPD) && + if ((Opcode == ARM::LDMIA_UPD || Opcode == ARM::t2LDMIA_UPD) && MI->getOperand(0).getReg() == ARM::SP) { - const MCOperand &MO1 = MI->getOperand(2); - if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::ia) { - O << '\t' << "pop"; - printPredicateOperand(MI, 3, O); - O << '\t'; - printRegisterList(MI, 5, O); - return; - } + O << '\t' << "pop"; + printPredicateOperand(MI, 2, O); + if (Opcode == ARM::t2LDMIA_UPD) + O << ".w"; + O << '\t'; + printRegisterList(MI, 4, O); + return; } // A8.6.355 VPUSH - if ((MI->getOpcode() == ARM::VSTMS_UPD || MI->getOpcode() ==ARM::VSTMD_UPD) && + if ((Opcode == ARM::VSTMSDB_UPD || Opcode == ARM::VSTMDDB_UPD) && MI->getOperand(0).getReg() == ARM::SP) { - const MCOperand &MO1 = MI->getOperand(2); - if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::db) { - O << '\t' << "vpush"; - printPredicateOperand(MI, 3, O); - O << '\t'; - printRegisterList(MI, 5, O); - return; - } + O << '\t' << "vpush"; + printPredicateOperand(MI, 2, O); + O << '\t'; + printRegisterList(MI, 4, O); + return; } // A8.6.354 VPOP - if ((MI->getOpcode() == ARM::VLDMS_UPD || MI->getOpcode() ==ARM::VLDMD_UPD) && + if ((Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMDIA_UPD) && MI->getOperand(0).getReg() == ARM::SP) { - const MCOperand &MO1 = MI->getOperand(2); - if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::ia) { - O << '\t' << "vpop"; - printPredicateOperand(MI, 3, O); - O << '\t'; - printRegisterList(MI, 5, O); - return; - } + O << '\t' << "vpop"; + printPredicateOperand(MI, 2, O); + O << '\t'; + printRegisterList(MI, 4, O); + return; } printInstruction(MI, O); - } +} void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier) { + raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { unsigned Reg = Op.getReg(); - if (Modifier && strcmp(Modifier, "dregpair") == 0) { - O << '{' << getRegisterName(Reg) << ", " - << getRegisterName(NextReg(Reg)) << '}'; -#if 0 - // FIXME: Breaks e.g. ARM/vmul.ll. - assert(0); - /* - unsigned DRegLo = TRI->getSubReg(Reg, ARM::dsub_0); - unsigned DRegHi = TRI->getSubReg(Reg, ARM::dsub_1); - O << '{' - << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi) - << '}';*/ -#endif - } else if (Modifier && strcmp(Modifier, "lane") == 0) { - assert(0); - /* - unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg); - unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1, - &ARM::DPR_VFP2RegClass); - O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']'; - */ - } else { - O << getRegisterName(Reg); - } + O << getRegisterName(Reg); } else if (Op.isImm()) { - assert((Modifier && !strcmp(Modifier, "call")) || - ((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported")); O << '#' << Op.getImm(); } else { - if (Modifier && Modifier[0] != 0 && strcmp(Modifier, "call") != 0) - llvm_unreachable("Unsupported modifier"); assert(Op.isExpr() && "unknown operand kind in printOperand"); O << *Op.getExpr(); } } -static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm, +static void printSOImm(raw_ostream &O, int64_t V, raw_ostream *CommentStream, const MCAsmInfo *MAI) { // Break it up into two parts that make up a shifter immediate. V = ARM_AM::getSOImmVal(V); assert(V != -1 && "Not a valid so_imm value!"); - + unsigned Imm = ARM_AM::getSOImmValImm(V); unsigned Rot = ARM_AM::getSOImmValRot(V); - + // Print low-level immediate formation info, per // A5.1.3: "Data-processing operands - Immediate". if (Rot) { O << "#" << Imm << ", " << Rot; // Pretty printed version. - if (VerboseAsm) - O << ' ' << MAI->getCommentString() - << ' ' << (int)ARM_AM::rotr32(Imm, Rot); + if (CommentStream) + *CommentStream << (int)ARM_AM::rotr32(Imm, Rot) << "\n"; } else { O << "#" << Imm; } @@ -253,15 +151,7 @@ void ARMInstPrinter::printSOImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); assert(MO.isImm() && "Not a valid so_imm value!"); - printSOImm(O, MO.getImm(), VerboseAsm, &MAI); -} - -/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov' -/// followed by an 'orr' to materialize. -void ARMInstPrinter::printSOImm2PartOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - // FIXME: REMOVE this method. - abort(); + printSOImm(O, MO.getImm(), CommentStream, &MAI); } // so_reg is a 4-operand unit corresponding to register forms of the A5.1 @@ -274,9 +164,9 @@ void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum, const MCOperand &MO1 = MI->getOperand(OpNum); const MCOperand &MO2 = MI->getOperand(OpNum+1); const MCOperand &MO3 = MI->getOperand(OpNum+2); - + O << getRegisterName(MO1.getReg()); - + // Print the shift opc. ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm()); O << ", " << ARM_AM::getShiftOpcStr(ShOpc); @@ -294,14 +184,14 @@ void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, const MCOperand &MO1 = MI->getOperand(Op); const MCOperand &MO2 = MI->getOperand(Op+1); const MCOperand &MO3 = MI->getOperand(Op+2); - + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. printOperand(MI, Op, O); return; } - + O << "[" << getRegisterName(MO1.getReg()); - + if (!MO2.getReg()) { if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0. O << ", #" @@ -310,24 +200,24 @@ void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, O << "]"; return; } - + O << ", " << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) << getRegisterName(MO2.getReg()); - + if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm())) O << ", " << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm())) << " #" << ShImm; O << "]"; -} +} void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(OpNum); const MCOperand &MO2 = MI->getOperand(OpNum+1); - + if (!MO1.getReg()) { unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm()); O << '#' @@ -335,10 +225,10 @@ void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, << ImmOffs; return; } - + O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) << getRegisterName(MO1.getReg()); - + if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm())) O << ", " << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm())) @@ -350,15 +240,15 @@ void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned OpNum, const MCOperand &MO1 = MI->getOperand(OpNum); const MCOperand &MO2 = MI->getOperand(OpNum+1); const MCOperand &MO3 = MI->getOperand(OpNum+2); - + O << '[' << getRegisterName(MO1.getReg()); - + if (MO2.getReg()) { O << ", " << (char)ARM_AM::getAM3Op(MO3.getImm()) << getRegisterName(MO2.getReg()) << ']'; return; } - + if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm())) O << ", #" << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) @@ -371,53 +261,42 @@ void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(OpNum); const MCOperand &MO2 = MI->getOperand(OpNum+1); - + if (MO1.getReg()) { O << (char)ARM_AM::getAM3Op(MO2.getImm()) << getRegisterName(MO1.getReg()); return; } - + unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm()); O << '#' << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) << ImmOffs; } - -void ARMInstPrinter::printAddrMode4Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, - const char *Modifier) { - const MCOperand &MO2 = MI->getOperand(OpNum+1); - ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); - if (Modifier && strcmp(Modifier, "submode") == 0) { - O << ARM_AM::getAMSubModeStr(Mode); - } else if (Modifier && strcmp(Modifier, "wide") == 0) { - ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); - if (Mode == ARM_AM::ia) - O << ".w"; - } else { - printOperand(MI, OpNum, O); - } +void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(OpNum) + .getImm()); + O << ARM_AM::getAMSubModeStr(Mode); } void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, - const char *Modifier) { + raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(OpNum); const MCOperand &MO2 = MI->getOperand(OpNum+1); - + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. printOperand(MI, OpNum, O); return; } - + O << "[" << getRegisterName(MO1.getReg()); - + if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) { O << ", #" << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm())) - << ImmOffs*4; + << ImmOffs * 4; } O << "]"; } @@ -426,7 +305,7 @@ void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(OpNum); const MCOperand &MO2 = MI->getOperand(OpNum+1); - + O << "[" << getRegisterName(MO1.getReg()); if (MO2.getImm()) { // FIXME: Both darwin as and GNU as violate ARM docs here. @@ -445,12 +324,6 @@ void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI, O << ", " << getRegisterName(MO.getReg()); } -void ARMInstPrinter::printAddrModePCOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, - const char *Modifier) { - assert(0 && "FIXME: Implement printAddrModePCOperand"); -} - void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { @@ -497,33 +370,41 @@ void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum, O << "}"; } -void ARMInstPrinter::printCPSOptionOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { +void ARMInstPrinter::printSetendOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNum); - unsigned option = Op.getImm(); - unsigned mode = option & 31; - bool changemode = option >> 5 & 1; - unsigned AIF = option >> 6 & 7; - unsigned imod = option >> 9 & 3; - if (imod == 2) - O << "ie"; - else if (imod == 3) - O << "id"; - O << '\t'; - if (imod > 1) { - if (AIF & 4) O << 'a'; - if (AIF & 2) O << 'i'; - if (AIF & 1) O << 'f'; - if (AIF > 0 && changemode) O << ", "; - } - if (changemode) - O << '#' << mode; + if (Op.getImm()) + O << "be"; + else + O << "le"; +} + +void ARMInstPrinter::printCPSIMod(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + O << ARM_PROC::IModToString(Op.getImm()); +} + +void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + unsigned IFlags = Op.getImm(); + for (int i=2; i >= 0; --i) + if (IFlags & (1 << i)) + O << ARM_PROC::IFlagsToString(1 << i); } void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNum); - unsigned Mask = Op.getImm(); + unsigned SpecRegRBit = Op.getImm() >> 4; + unsigned Mask = Op.getImm() & 0xf; + + if (SpecRegRBit) + O << "spsr"; + else + O << "cpsr"; + if (Mask) { O << '_'; if (Mask & 8) O << 'f'; @@ -550,7 +431,7 @@ void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum, O << ARMCondCodeToString(CC); } -void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI, +void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); @@ -566,25 +447,24 @@ void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum, } } - - -void ARMInstPrinter::printCPInstOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, - const char *Modifier) { - // FIXME: remove this. - abort(); -} - void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O) { O << MI->getOperand(OpNum).getImm(); } +void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "p" << MI->getOperand(OpNum).getImm(); +} + +void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "c" << MI->getOperand(OpNum).getImm(); +} void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - // FIXME: remove this. - abort(); + llvm_unreachable("Unhandled PC-relative pseudo-instruction!"); } void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, @@ -611,17 +491,25 @@ void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum, void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(Op); - const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO2 = MI->getOperand(Op + 1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + O << "[" << getRegisterName(MO1.getReg()); - O << ", " << getRegisterName(MO2.getReg()) << "]"; + if (unsigned RegNum = MO2.getReg()) + O << ", " << getRegisterName(RegNum); + O << "]"; } -void ARMInstPrinter::printThumbAddrModeRI5Operand(const MCInst *MI, unsigned Op, - raw_ostream &O, - unsigned Scale) { +void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI, + unsigned Op, + raw_ostream &O, + unsigned Scale) { const MCOperand &MO1 = MI->getOperand(Op); - const MCOperand &MO2 = MI->getOperand(Op+1); - const MCOperand &MO3 = MI->getOperand(Op+2); + const MCOperand &MO2 = MI->getOperand(Op + 1); if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. printOperand(MI, Op, O); @@ -629,44 +517,32 @@ void ARMInstPrinter::printThumbAddrModeRI5Operand(const MCInst *MI, unsigned Op, } O << "[" << getRegisterName(MO1.getReg()); - if (MO3.getReg()) - O << ", " << getRegisterName(MO3.getReg()); - else if (unsigned ImmOffs = MO2.getImm()) + if (unsigned ImmOffs = MO2.getImm()) O << ", #" << ImmOffs * Scale; O << "]"; } -void ARMInstPrinter::printThumbAddrModeS1Operand(const MCInst *MI, unsigned Op, - raw_ostream &O) { - printThumbAddrModeRI5Operand(MI, Op, O, 1); +void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI, + unsigned Op, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, O, 1); } -void ARMInstPrinter::printThumbAddrModeS2Operand(const MCInst *MI, unsigned Op, - raw_ostream &O) { - printThumbAddrModeRI5Operand(MI, Op, O, 2); +void ARMInstPrinter::printThumbAddrModeImm5S2Operand(const MCInst *MI, + unsigned Op, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, O, 2); } -void ARMInstPrinter::printThumbAddrModeS4Operand(const MCInst *MI, unsigned Op, - raw_ostream &O) { - printThumbAddrModeRI5Operand(MI, Op, O, 4); +void ARMInstPrinter::printThumbAddrModeImm5S4Operand(const MCInst *MI, + unsigned Op, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, O, 4); } void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op, raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(Op); - const MCOperand &MO2 = MI->getOperand(Op+1); - O << "[" << getRegisterName(MO1.getReg()); - if (unsigned ImmOffs = MO2.getImm()) - O << ", #" << ImmOffs*4; - O << "]"; -} - -void ARMInstPrinter::printTBAddrMode(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "[pc, " << getRegisterName(MI->getOperand(OpNum).getReg()); - if (MI->getOpcode() == ARM::t2TBH) - O << ", lsl #1"; - O << ']'; + printThumbAddrModeImm5SOperand(MI, Op, O, 4); } // Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2 @@ -689,16 +565,26 @@ void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum, O << " #" << ARM_AM::getSORegOffset(MO2.getImm()); } -void ARMInstPrinter::printT2AddrModeImm12Operand(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { +void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(OpNum); const MCOperand &MO2 = MI->getOperand(OpNum+1); + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, OpNum, O); + return; + } + O << "[" << getRegisterName(MO1.getReg()); - unsigned OffImm = MO2.getImm(); - if (OffImm) // Don't print +0. + int32_t OffImm = (int32_t)MO2.getImm(); + bool isSub = OffImm < 0; + // Special value for #-0. All others are normal. + if (OffImm == INT32_MIN) + OffImm = 0; + if (isSub) + O << ", #-" << -OffImm; + else if (OffImm > 0) O << ", #" << OffImm; O << "]"; } @@ -783,12 +669,37 @@ void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI, void ARMInstPrinter::printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - O << '#' << MI->getOperand(OpNum).getImm(); + const MCOperand &MO = MI->getOperand(OpNum); + O << '#'; + if (MO.isFPImm()) { + O << (float)MO.getFPImm(); + } else { + union { + uint32_t I; + float F; + } FPUnion; + + FPUnion.I = MO.getImm(); + O << FPUnion.F; + } } void ARMInstPrinter::printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - O << '#' << MI->getOperand(OpNum).getImm(); + const MCOperand &MO = MI->getOperand(OpNum); + O << '#'; + if (MO.isFPImm()) { + O << MO.getFPImm(); + } else { + // We expect the binary encoding of a floating point number here. + union { + uint64_t I; + double D; + } FPUnion; + + FPUnion.I = MO.getImm(); + O << FPUnion.D; + } } void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index e5ad0d0..679d313 100644 --- a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -18,26 +18,25 @@ namespace llvm { class MCOperand; - + class ARMInstPrinter : public MCInstPrinter { - bool VerboseAsm; public: - ARMInstPrinter(const MCAsmInfo &MAI, bool verboseAsm) - : MCInstPrinter(MAI), VerboseAsm(verboseAsm) {} + ARMInstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) {} virtual void printInst(const MCInst *MI, raw_ostream &O); - + virtual StringRef getOpcodeName(unsigned Opcode) const; + + static const char *getInstructionName(unsigned Opcode); + // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0); - + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSOImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printSOImm2PartOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - + void printSORegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, @@ -45,15 +44,11 @@ public: void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAddrMode4Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O, - const char *Modifier = 0); - void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O, - const char *Modifier = 0); + void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAddrModePCOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O, - const char *Modifier = 0); void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); @@ -64,20 +59,20 @@ public: void printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printThumbAddrModeRI5Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, unsigned Scale); - void printThumbAddrModeS1Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - void printThumbAddrModeS2Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - void printThumbAddrModeS4Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); + void printThumbAddrModeImm5SOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, unsigned Scale); + void printThumbAddrModeImm5S1Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printThumbAddrModeImm5S2Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printThumbAddrModeImm5S4Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - + void printT2SOOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printT2AddrModeImm12Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); + void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum, @@ -88,7 +83,10 @@ public: raw_ostream &O); void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - + + void printSetendOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printCPSIMod(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printCPSIFlag(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printCPSOptionOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printNegZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); @@ -98,21 +96,16 @@ public: void printSBitModifierOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printRegisterList(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printCPInstOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O, - const char *Modifier); - void printJTBlockOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {} - void printJT2BlockOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {} - void printTBAddrMode(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printPImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printCImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); - // FIXME: Implement. - void PrintSpecial(const MCInst *MI, raw_ostream &O, const char *Kind) {} + void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); }; - -} + +} // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/InstPrinter/CMakeLists.txt new file mode 100644 index 0000000..18645c0 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/CMakeLists.txt @@ -0,0 +1,6 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMARMAsmPrinter + ARMInstPrinter.cpp + ) +add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/Makefile b/contrib/llvm/lib/Target/ARM/InstPrinter/Makefile new file mode 100644 index 0000000..65d372e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMARMAsmPrinter + +# Hack: we need to include 'main' arm target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp new file mode 100644 index 0000000..f9e86eb --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp @@ -0,0 +1,321 @@ +//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ----------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of +// multiple and add / sub instructions) when special VMLx hazards are detected. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mlx-expansion" +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +static cl::opt<bool> +ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden); +static cl::opt<unsigned> +ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden); + +STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded"); + +namespace { + struct MLxExpansion : public MachineFunctionPass { + static char ID; + MLxExpansion() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM MLA / MLS expansion pass"; + } + + private: + const ARMBaseInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + unsigned MIIdx; + MachineInstr* LastMIs[4]; + + void clearStack(); + void pushStack(MachineInstr *MI); + MachineInstr *getAccDefMI(MachineInstr *MI) const; + unsigned getDefReg(MachineInstr *MI) const; + bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const; + bool FindMLxHazard(MachineInstr *MI) const; + void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, + unsigned MulOpc, unsigned AddSubOpc, + bool NegAcc, bool HasLane); + bool ExpandFPMLxInstructions(MachineBasicBlock &MBB); + }; + char MLxExpansion::ID = 0; +} + +void MLxExpansion::clearStack() { + std::fill(LastMIs, LastMIs + 4, (MachineInstr*)0); + MIIdx = 0; +} + +void MLxExpansion::pushStack(MachineInstr *MI) { + LastMIs[MIIdx] = MI; + if (++MIIdx == 4) + MIIdx = 0; +} + +MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { + // Look past COPY and INSERT_SUBREG instructions to find the + // real definition MI. This is important for _sfp instructions. + unsigned Reg = MI->getOperand(1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return 0; + + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *DefMI = MRI->getVRegDef(Reg); + while (true) { + if (DefMI->getParent() != MBB) + break; + if (DefMI->isCopyLike()) { + Reg = DefMI->getOperand(1).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + DefMI = MRI->getVRegDef(Reg); + continue; + } + } else if (DefMI->isInsertSubreg()) { + Reg = DefMI->getOperand(2).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + DefMI = MRI->getVRegDef(Reg); + continue; + } + } + break; + } + return DefMI; +} + +unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { + unsigned Reg = MI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI->hasOneNonDBGUse(Reg)) + return Reg; + + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *UseMI = &*MRI->use_nodbg_begin(Reg); + if (UseMI->getParent() != MBB) + return Reg; + + while (UseMI->isCopy() || UseMI->isInsertSubreg()) { + Reg = UseMI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI->hasOneNonDBGUse(Reg)) + return Reg; + UseMI = &*MRI->use_nodbg_begin(Reg); + if (UseMI->getParent() != MBB) + return Reg; + } + + return Reg; +} + +bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const { + const TargetInstrDesc &TID = MI->getDesc(); + // FIXME: Detect integer instructions properly. + unsigned Domain = TID.TSFlags & ARMII::DomainMask; + if (Domain == ARMII::DomainVFP) { + unsigned Opcode = TID.getOpcode(); + if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD || + Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) + return false; + } else if (Domain == ARMII::DomainNEON) { + if (TID.mayStore() || TID.mayLoad()) + return false; + } else { + return false; + } + + return MI->readsRegister(Reg, TRI); + return false; +} + + +bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { + if (NumExpand >= ExpandLimit) + return false; + + if (ForceExapnd) + return true; + + MachineInstr *DefMI = getAccDefMI(MI); + if (TII->isFpMLxInstruction(DefMI->getOpcode())) + // r0 = vmla + // r3 = vmla r0, r1, r2 + // takes 16 - 17 cycles + // + // r0 = vmla + // r4 = vmul r1, r2 + // r3 = vadd r0, r4 + // takes about 14 - 15 cycles even with vmul stalling for 4 cycles. + return true; + + // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the + // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall + // preserves the in-order retirement of the instructions. + // Look at the next few instructions, if *most* of them can cause hazards, + // then the scheduler can't *fix* this, we'd better break up the VMLA. + for (unsigned i = 1; i <= 4; ++i) { + int Idx = ((int)MIIdx - i + 4) % 4; + MachineInstr *NextMI = LastMIs[Idx]; + if (!NextMI) + continue; + + if (TII->canCauseFpMLxStall(NextMI->getOpcode())) + return true; + + // Look for VMLx RAW hazard. + if (hasRAWHazard(getDefReg(MI), NextMI)) + return true; + } + + return false; +} + +/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair +/// of MUL + ADD / SUB instructions. +void +MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, + unsigned MulOpc, unsigned AddSubOpc, + bool NegAcc, bool HasLane) { + unsigned DstReg = MI->getOperand(0).getReg(); + bool DstDead = MI->getOperand(0).isDead(); + unsigned AccReg = MI->getOperand(1).getReg(); + unsigned Src1Reg = MI->getOperand(2).getReg(); + unsigned Src2Reg = MI->getOperand(3).getReg(); + bool Src1Kill = MI->getOperand(2).isKill(); + bool Src2Kill = MI->getOperand(3).isKill(); + unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0; + unsigned NextOp = HasLane ? 5 : 4; + ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm(); + unsigned PredReg = MI->getOperand(++NextOp).getReg(); + + const TargetInstrDesc &TID1 = TII->get(MulOpc); + const TargetInstrDesc &TID2 = TII->get(AddSubOpc); + unsigned TmpReg = MRI->createVirtualRegister(TID1.getRegClass(0, TRI)); + + MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID1, TmpReg) + .addReg(Src1Reg, getKillRegState(Src1Kill)) + .addReg(Src2Reg, getKillRegState(Src2Kill)); + if (HasLane) + MIB.addImm(LaneImm); + MIB.addImm(Pred).addReg(PredReg); + + MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID2) + .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead)); + + if (NegAcc) { + bool AccKill = MRI->hasOneNonDBGUse(AccReg); + MIB.addReg(TmpReg, getKillRegState(true)) + .addReg(AccReg, getKillRegState(AccKill)); + } else { + MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true)); + } + MIB.addImm(Pred).addReg(PredReg); + + DEBUG({ + dbgs() << "Expanding: " << *MI; + dbgs() << " to:\n"; + MachineBasicBlock::iterator MII = MI; + MII = llvm::prior(MII); + MachineInstr &MI2 = *MII; + MII = llvm::prior(MII); + MachineInstr &MI1 = *MII; + dbgs() << " " << MI1; + dbgs() << " " << MI2; + }); + + MI->eraseFromParent(); + ++NumExpand; +} + +bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { + bool Changed = false; + + clearStack(); + + unsigned Skip = 0; + MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); + while (MII != E) { + MachineInstr *MI = &*MII; + + if (MI->isLabel() || MI->isImplicitDef() || MI->isCopy()) { + ++MII; + continue; + } + + const TargetInstrDesc &TID = MI->getDesc(); + if (TID.isBarrier()) { + clearStack(); + Skip = 0; + ++MII; + continue; + } + + unsigned Domain = TID.TSFlags & ARMII::DomainMask; + if (Domain == ARMII::DomainGeneral) { + if (++Skip == 2) + // Assume dual issues of non-VFP / NEON instructions. + pushStack(0); + } else { + Skip = 0; + + unsigned MulOpc, AddSubOpc; + bool NegAcc, HasLane; + if (!TII->isFpMLxInstruction(TID.getOpcode(), + MulOpc, AddSubOpc, NegAcc, HasLane) || + !FindMLxHazard(MI)) + pushStack(MI); + else { + ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane); + E = MBB.rend(); // May have changed if MI was the 1st instruction. + Changed = true; + continue; + } + } + + ++MII; + } + + return Changed; +} + +bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { + TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo()); + TRI = Fn.getTarget().getRegisterInfo(); + MRI = &Fn.getRegInfo(); + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= ExpandFPMLxInstructions(MBB); + } + + return Modified; +} + +FunctionPass *llvm::createMLxExpansionPass() { + return new MLxExpansion(); +} diff --git a/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp b/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp deleted file mode 100644 index 3407ac6..0000000 --- a/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp +++ /dev/null @@ -1,406 +0,0 @@ -//===-- NEONPreAllocPass.cpp - Allocate adjacent NEON registers--*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "neon-prealloc" -#include "ARM.h" -#include "ARMInstrInfo.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -using namespace llvm; - -namespace { - class NEONPreAllocPass : public MachineFunctionPass { - const TargetInstrInfo *TII; - MachineRegisterInfo *MRI; - - public: - static char ID; - NEONPreAllocPass() : MachineFunctionPass(ID) {} - - virtual bool runOnMachineFunction(MachineFunction &MF); - - virtual const char *getPassName() const { - return "NEON register pre-allocation pass"; - } - - private: - bool FormsRegSequence(MachineInstr *MI, - unsigned FirstOpnd, unsigned NumRegs, - unsigned Offset, unsigned Stride) const; - bool PreAllocNEONRegisters(MachineBasicBlock &MBB); - }; - - char NEONPreAllocPass::ID = 0; -} - -static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, - unsigned &Offset, unsigned &Stride) { - // Default to unit stride with no offset. - Stride = 1; - Offset = 0; - - switch (Opcode) { - default: - break; - - case ARM::VLD2LNd8: - case ARM::VLD2LNd16: - case ARM::VLD2LNd32: - FirstOpnd = 0; - NumRegs = 2; - return true; - - case ARM::VLD2LNq16: - case ARM::VLD2LNq32: - FirstOpnd = 0; - NumRegs = 2; - Offset = 0; - Stride = 2; - return true; - - case ARM::VLD2LNq16odd: - case ARM::VLD2LNq32odd: - FirstOpnd = 0; - NumRegs = 2; - Offset = 1; - Stride = 2; - return true; - - case ARM::VLD3LNd8: - case ARM::VLD3LNd16: - case ARM::VLD3LNd32: - FirstOpnd = 0; - NumRegs = 3; - return true; - - case ARM::VLD3LNq16: - case ARM::VLD3LNq32: - FirstOpnd = 0; - NumRegs = 3; - Offset = 0; - Stride = 2; - return true; - - case ARM::VLD3LNq16odd: - case ARM::VLD3LNq32odd: - FirstOpnd = 0; - NumRegs = 3; - Offset = 1; - Stride = 2; - return true; - - case ARM::VLD4LNd8: - case ARM::VLD4LNd16: - case ARM::VLD4LNd32: - FirstOpnd = 0; - NumRegs = 4; - return true; - - case ARM::VLD4LNq16: - case ARM::VLD4LNq32: - FirstOpnd = 0; - NumRegs = 4; - Offset = 0; - Stride = 2; - return true; - - case ARM::VLD4LNq16odd: - case ARM::VLD4LNq32odd: - FirstOpnd = 0; - NumRegs = 4; - Offset = 1; - Stride = 2; - return true; - - case ARM::VST2LNd8: - case ARM::VST2LNd16: - case ARM::VST2LNd32: - FirstOpnd = 2; - NumRegs = 2; - return true; - - case ARM::VST2LNq16: - case ARM::VST2LNq32: - FirstOpnd = 2; - NumRegs = 2; - Offset = 0; - Stride = 2; - return true; - - case ARM::VST2LNq16odd: - case ARM::VST2LNq32odd: - FirstOpnd = 2; - NumRegs = 2; - Offset = 1; - Stride = 2; - return true; - - case ARM::VST3LNd8: - case ARM::VST3LNd16: - case ARM::VST3LNd32: - FirstOpnd = 2; - NumRegs = 3; - return true; - - case ARM::VST3LNq16: - case ARM::VST3LNq32: - FirstOpnd = 2; - NumRegs = 3; - Offset = 0; - Stride = 2; - return true; - - case ARM::VST3LNq16odd: - case ARM::VST3LNq32odd: - FirstOpnd = 2; - NumRegs = 3; - Offset = 1; - Stride = 2; - return true; - - case ARM::VST4LNd8: - case ARM::VST4LNd16: - case ARM::VST4LNd32: - FirstOpnd = 2; - NumRegs = 4; - return true; - - case ARM::VST4LNq16: - case ARM::VST4LNq32: - FirstOpnd = 2; - NumRegs = 4; - Offset = 0; - Stride = 2; - return true; - - case ARM::VST4LNq16odd: - case ARM::VST4LNq32odd: - FirstOpnd = 2; - NumRegs = 4; - Offset = 1; - Stride = 2; - return true; - - case ARM::VTBL2: - FirstOpnd = 1; - NumRegs = 2; - return true; - - case ARM::VTBL3: - FirstOpnd = 1; - NumRegs = 3; - return true; - - case ARM::VTBL4: - FirstOpnd = 1; - NumRegs = 4; - return true; - - case ARM::VTBX2: - FirstOpnd = 2; - NumRegs = 2; - return true; - - case ARM::VTBX3: - FirstOpnd = 2; - NumRegs = 3; - return true; - - case ARM::VTBX4: - FirstOpnd = 2; - NumRegs = 4; - return true; - } - - return false; -} - -bool -NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, - unsigned FirstOpnd, unsigned NumRegs, - unsigned Offset, unsigned Stride) const { - MachineOperand &FMO = MI->getOperand(FirstOpnd); - assert(FMO.isReg() && FMO.getSubReg() == 0 && "unexpected operand"); - unsigned VirtReg = FMO.getReg(); - (void)VirtReg; - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && - "expected a virtual register"); - - unsigned LastSubIdx = 0; - if (FMO.isDef()) { - MachineInstr *RegSeq = 0; - for (unsigned R = 0; R < NumRegs; ++R) { - const MachineOperand &MO = MI->getOperand(FirstOpnd + R); - assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); - unsigned VirtReg = MO.getReg(); - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && - "expected a virtual register"); - // Feeding into a REG_SEQUENCE. - if (!MRI->hasOneNonDBGUse(VirtReg)) - return false; - MachineInstr *UseMI = &*MRI->use_nodbg_begin(VirtReg); - if (!UseMI->isRegSequence()) - return false; - if (RegSeq && RegSeq != UseMI) - return false; - unsigned OpIdx = 1 + (Offset + R * Stride) * 2; - if (UseMI->getOperand(OpIdx).getReg() != VirtReg) - llvm_unreachable("Malformed REG_SEQUENCE instruction!"); - unsigned SubIdx = UseMI->getOperand(OpIdx + 1).getImm(); - if (LastSubIdx) { - if (LastSubIdx != SubIdx-Stride) - return false; - } else { - // Must start from dsub_0 or qsub_0. - if (SubIdx != (ARM::dsub_0+Offset) && - SubIdx != (ARM::qsub_0+Offset)) - return false; - } - RegSeq = UseMI; - LastSubIdx = SubIdx; - } - - // In the case of vld3, etc., make sure the trailing operand of - // REG_SEQUENCE is an undef. - if (NumRegs == 3) { - unsigned OpIdx = 1 + (Offset + 3 * Stride) * 2; - const MachineOperand &MO = RegSeq->getOperand(OpIdx); - unsigned VirtReg = MO.getReg(); - MachineInstr *DefMI = MRI->getVRegDef(VirtReg); - if (!DefMI || !DefMI->isImplicitDef()) - return false; - } - return true; - } - - unsigned LastSrcReg = 0; - SmallVector<unsigned, 4> SubIds; - for (unsigned R = 0; R < NumRegs; ++R) { - const MachineOperand &MO = MI->getOperand(FirstOpnd + R); - assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); - unsigned VirtReg = MO.getReg(); - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && - "expected a virtual register"); - // Extracting from a Q or QQ register. - MachineInstr *DefMI = MRI->getVRegDef(VirtReg); - if (!DefMI || !DefMI->isCopy() || !DefMI->getOperand(1).getSubReg()) - return false; - VirtReg = DefMI->getOperand(1).getReg(); - if (LastSrcReg && LastSrcReg != VirtReg) - return false; - LastSrcReg = VirtReg; - const TargetRegisterClass *RC = MRI->getRegClass(VirtReg); - if (RC != ARM::QPRRegisterClass && - RC != ARM::QQPRRegisterClass && - RC != ARM::QQQQPRRegisterClass) - return false; - unsigned SubIdx = DefMI->getOperand(1).getSubReg(); - if (LastSubIdx) { - if (LastSubIdx != SubIdx-Stride) - return false; - } else { - // Must start from dsub_0 or qsub_0. - if (SubIdx != (ARM::dsub_0+Offset) && - SubIdx != (ARM::qsub_0+Offset)) - return false; - } - SubIds.push_back(SubIdx); - LastSubIdx = SubIdx; - } - - // FIXME: Update the uses of EXTRACT_SUBREG from REG_SEQUENCE is - // currently required for correctness. e.g. - // %reg1041<def> = REG_SEQUENCE %reg1040<kill>, 5, %reg1035<kill>, 6 - // %reg1042<def> = EXTRACT_SUBREG %reg1041, 6 - // %reg1043<def> = EXTRACT_SUBREG %reg1041, 5 - // VST1q16 %reg1025<kill>, 0, %reg1043<kill>, %reg1042<kill>, - // reg1042 and reg1043 should be replaced with reg1041:6 and reg1041:5 - // respectively. - // We need to change how we model uses of REG_SEQUENCE. - for (unsigned R = 0; R < NumRegs; ++R) { - MachineOperand &MO = MI->getOperand(FirstOpnd + R); - unsigned OldReg = MO.getReg(); - MachineInstr *DefMI = MRI->getVRegDef(OldReg); - assert(DefMI->isCopy()); - MO.setReg(LastSrcReg); - MO.setSubReg(SubIds[R]); - MO.setIsKill(false); - // Delete the EXTRACT_SUBREG if its result is now dead. - if (MRI->use_empty(OldReg)) - DefMI->eraseFromParent(); - } - - return true; -} - -bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) { - bool Modified = false; - - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - for (; MBBI != E; ++MBBI) { - MachineInstr *MI = &*MBBI; - unsigned FirstOpnd, NumRegs, Offset, Stride; - if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride)) - continue; - if (FormsRegSequence(MI, FirstOpnd, NumRegs, Offset, Stride)) - continue; - - MachineBasicBlock::iterator NextI = llvm::next(MBBI); - for (unsigned R = 0; R < NumRegs; ++R) { - MachineOperand &MO = MI->getOperand(FirstOpnd + R); - assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); - unsigned VirtReg = MO.getReg(); - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && - "expected a virtual register"); - - // For now, just assign a fixed set of adjacent registers. - // This leaves plenty of room for future improvements. - static const unsigned NEONDRegs[] = { - ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7 - }; - MO.setReg(NEONDRegs[Offset + R * Stride]); - - if (MO.isUse()) { - // Insert a copy from VirtReg. - BuildMI(MBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY),MO.getReg()) - .addReg(VirtReg, getKillRegState(MO.isKill())); - MO.setIsKill(); - } else if (MO.isDef() && !MO.isDead()) { - // Add a copy to VirtReg. - BuildMI(MBB, NextI, DebugLoc(), TII->get(TargetOpcode::COPY), VirtReg) - .addReg(MO.getReg()); - } - } - } - - return Modified; -} - -bool NEONPreAllocPass::runOnMachineFunction(MachineFunction &MF) { - TII = MF.getTarget().getInstrInfo(); - MRI = &MF.getRegInfo(); - - bool Modified = false; - for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; - ++MFI) { - MachineBasicBlock &MBB = *MFI; - Modified |= PreAllocNEONRegisters(MBB); - } - - return Modified; -} - -/// createNEONPreAllocPass - returns an instance of the NEON register -/// pre-allocation pass. -FunctionPass *llvm::createNEONPreAllocPass() { - return new NEONPreAllocPass(); -} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp new file mode 100644 index 0000000..233e165 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -0,0 +1,352 @@ +//======- Thumb1FrameLowering.cpp - Thumb1 Frame Information ---*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb1 implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "Thumb1FrameLowering.h" +#include "ARMBaseInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + unsigned CFSize = FFI->getMaxCallFrameSize(); + // It's not always a good idea to include the call frame as part of the + // stack frame. ARM (especially Thumb) has small immediate offset to + // address the stack frame. So a large call frame can cause poor codegen + // and may even makes it impossible to scavenge a register. + if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4 + return false; + + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +static void emitSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetInstrInfo &TII, DebugLoc dl, + const Thumb1RegisterInfo &MRI, + int NumBytes) { + emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII, + MRI, dl); +} + +void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const Thumb1RegisterInfo *RegInfo = + static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo()); + const Thumb1InstrInfo &TII = + *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo()); + + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned NumBytes = MFI->getStackSize(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned BasePtr = RegInfo->getBaseRegister(); + + // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4. + NumBytes = (NumBytes + 3) & ~3; + MFI->setStackSize(NumBytes); + + // Determine the sizes of each callee-save spill areas and record which frame + // belongs to which callee-save spill areas. + unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; + int FramePtrSpillFI = 0; + + if (VARegSaveSize) + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -VARegSaveSize); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes); + return; + } + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + int FI = CSI[i].getFrameIdx(); + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + break; + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + if (STI.isTargetDarwin()) { + AFI->addGPRCalleeSavedArea2Frame(FI); + GPRCS2Size += 4; + } else { + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + } + break; + default: + AFI->addDPRCalleeSavedAreaFrame(FI); + DPRCSSize += 8; + } + } + + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) { + ++MBBI; + if (MBBI != MBB.end()) + dl = MBBI->getDebugLoc(); + } + + // Determine starting offsets of spill areas. + unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); + unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; + unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; + AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); + AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); + AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); + AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); + NumBytes = DPRCSOffset; + + // Adjust FP so it point to the stack slot that contains the previous FP. + if (hasFP(MF)) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) + .addFrameIndex(FramePtrSpillFI).addImm(0); + if (NumBytes > 7) + // If offset is > 7 then sp cannot be adjusted in a single instruction, + // try restoring from fp instead. + AFI->setShouldRestoreSPFromFP(true); + } + + if (NumBytes) + // Insert it after all the callee-save spills. + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes); + + if (STI.isTargetELF() && hasFP(MF)) + MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - + AFI->getFramePtrSpillOffset()); + + AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); + AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); + AFI->setDPRCalleeSavedAreaSize(DPRCSSize); + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (RegInfo->hasBasePointer(MF)) + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), BasePtr).addReg(ARM::SP); + + // If the frame has variable sized objects then the epilogue must restore + // the sp from fp. We can assume there's an FP here since hasFP already + // checks for hasVarSizedObjects. + if (MFI->hasVarSizedObjects()) + AFI->setShouldRestoreSPFromFP(true); +} + +static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) { + if (MI->getOpcode() == ARM::tRestore && + MI->getOperand(1).isFI() && + isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs)) + return true; + else if (MI->getOpcode() == ARM::tPOP) { + // The first two operands are predicates. The last two are + // imp-def and imp-use of SP. Check everything in between. + for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i) + if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs)) + return false; + return true; + } + return false; +} + +void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert((MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::tPOP_RET) && + "Can only insert epilog into returning blocks"); + DebugLoc dl = MBBI->getDebugLoc(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const Thumb1RegisterInfo *RegInfo = + static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo()); + const Thumb1InstrInfo &TII = + *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo()); + + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + int NumBytes = (int)MFI->getStackSize(); + const unsigned *CSRegs = RegInfo->getCalleeSavedRegs(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes); + } else { + // Unwind MBBI to point to first LDR / VLDRD. + if (MBBI != MBB.begin()) { + do + --MBBI; + while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs)); + if (!isCSRestore(MBBI, CSRegs)) + ++MBBI; + } + + // Move SP to start of FP callee save spill area. + NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + + AFI->getGPRCalleeSavedArea2Size() + + AFI->getDPRCalleeSavedAreaSize()); + + if (AFI->shouldRestoreSPFromFP()) { + NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot, the target is ELF and the function has FP, or + // the target uses var sized objects. + if (NumBytes) { + assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) && + "No scratch register to restore SP from FP!"); + emitThumbRegPlusImmediate(MBB, MBBI, ARM::R4, FramePtr, -NumBytes, + TII, *RegInfo, dl); + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) + .addReg(ARM::R4); + } else + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) + .addReg(FramePtr); + } else { + if (MBBI->getOpcode() == ARM::tBX_RET && + &MBB.front() != MBBI && + prior(MBBI)->getOpcode() == ARM::tPOP) { + MachineBasicBlock::iterator PMBBI = prior(MBBI); + emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes); + } else + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes); + } + } + + if (VARegSaveSize) { + // Unlike T2 and ARM mode, the T1 pop instruction cannot restore + // to LR, and we can't pop the value directly to the PC since + // we need to update the SP after popping the value. Therefore, we + // pop the old LR into R3 as a temporary. + + // Move back past the callee-saved register restoration + while (MBBI != MBB.end() && isCSRestore(MBBI, CSRegs)) + ++MBBI; + // Epilogue for vararg functions: pop LR to R3 and branch off it. + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) + .addReg(ARM::R3, RegState::Define); + + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, VARegSaveSize); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)) + .addReg(ARM::R3, RegState::Kill); + // erase the old tBX_RET instruction + MBB.erase(MBBI); + } +} + +bool Thumb1FrameLowering:: +spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL; + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + if (MI != MBB.end()) DL = MI->getDebugLoc(); + + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)); + AddDefaultPred(MIB); + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + bool isKill = true; + + // Add the callee-saved register as live-in unless it's LR and + // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress + // then it's already added to the function and entry block live-in sets. + if (Reg == ARM::LR) { + MachineFunction &MF = *MBB.getParent(); + if (MF.getFrameInfo()->isReturnAddressTaken() && + MF.getRegInfo().isLiveIn(Reg)) + isKill = false; + } + + if (isKill) + MBB.addLiveIn(Reg); + + MIB.addReg(Reg, getKillRegState(isKill)); + } + return true; +} + +bool Thumb1FrameLowering:: +restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; + DebugLoc DL = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP)); + AddDefaultPred(MIB); + + bool NumRegs = false; + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (Reg == ARM::LR) { + // Special epilogue for vararg functions. See emitEpilogue + if (isVarArg) + continue; + Reg = ARM::PC; + (*MIB).setDesc(TII.get(ARM::tPOP_RET)); + MI = MBB.erase(MI); + } + MIB.addReg(Reg, getDefRegState(true)); + NumRegs = true; + } + + // It's illegal to emit pop instruction without operands. + if (NumRegs) + MBB.insert(MI, &*MIB); + else + MF.DeleteMachineInstr(MIB); + + return true; +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h new file mode 100644 index 0000000..c592e12 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h @@ -0,0 +1,52 @@ +//===-- Thumb1FrameLowering.h - Thumb1-specific frame info stuff --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef __THUMB_FRAMEINFO_H_ +#define __THUMM_FRAMEINFO_H_ + +#include "ARM.h" +#include "ARMFrameLowering.h" +#include "ARMSubtarget.h" +#include "Thumb1InstrInfo.h" +#include "Thumb1RegisterInfo.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + class ARMSubtarget; + +class Thumb1FrameLowering : public ARMFrameLowering { +public: + explicit Thumb1FrameLowering(const ARMSubtarget &sti) + : ARMFrameLowering(sti) { + } + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool hasReservedCallFrame(const MachineFunction &MF) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index af630ac..3fbb433 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -71,8 +71,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), - MachineMemOperand::MOStore, 0, + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOStore, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSpill)) @@ -99,85 +100,12 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), - MachineMemOperand::MOLoad, 0, + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOLoad, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); } } - -bool Thumb1InstrInfo:: -spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - if (CSI.empty()) - return false; - - DebugLoc DL; - if (MI != MBB.end()) DL = MI->getDebugLoc(); - - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH)); - AddDefaultPred(MIB); - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); - bool isKill = true; - - // Add the callee-saved register as live-in unless it's LR and - // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress - // then it's already added to the function and entry block live-in sets. - if (Reg == ARM::LR) { - MachineFunction &MF = *MBB.getParent(); - if (MF.getFrameInfo()->isReturnAddressTaken() && - MF.getRegInfo().isLiveIn(Reg)) - isKill = false; - } - - if (isKill) - MBB.addLiveIn(Reg); - - MIB.addReg(Reg, getKillRegState(isKill)); - } - return true; -} - -bool Thumb1InstrInfo:: -restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - if (CSI.empty()) - return false; - - bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; - DebugLoc DL = MI->getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::tPOP)); - AddDefaultPred(MIB); - - bool NumRegs = false; - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); - if (Reg == ARM::LR) { - // Special epilogue for vararg functions. See emitEpilogue - if (isVarArg) - continue; - Reg = ARM::PC; - (*MIB).setDesc(get(ARM::tPOP_RET)); - MI = MBB.erase(MI); - } - MIB.addReg(Reg, getDefRegState(true)); - NumRegs = true; - } - - // It's illegal to emit pop instruction without operands. - if (NumRegs) - MBB.insert(MI, &*MIB); - else - MF.DeleteMachineInstr(MIB); - - return true; -} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h index 555135a..17ef2f7 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -37,28 +37,19 @@ public: /// const Thumb1RegisterInfo &getRegisterInfo() const { return RI; } - bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const; - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const; - void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const; void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FrameIndex, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIndex, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp index a21a3da..f62a13e 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -29,7 +29,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLocation.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" @@ -63,24 +63,11 @@ void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); - BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRcp)) + BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci)) .addReg(DestReg, getDefRegState(true), SubIdx) .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg); } -bool Thumb1RegisterInfo::hasReservedCallFrame(const MachineFunction &MF) const { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - unsigned CFSize = FFI->getMaxCallFrameSize(); - // It's not always a good idea to include the call frame as part of the - // stack frame. ARM (especially Thumb) has small immediate offset to - // address the stack frame. So a large call frame can cause poor codegen - // and may even makes it impossible to scavenge a register. - if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4 - return false; - - return !MF.getFrameInfo()->hasVarSizedObjects(); -} - /// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize /// a destreg = basereg + immediate in Thumb code. Materialize the immediate @@ -92,7 +79,7 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned BaseReg, int NumBytes, bool CanChangeCC, const TargetInstrInfo &TII, - const Thumb1RegisterInfo& MRI, + const ARMBaseRegisterInfo& MRI, DebugLoc dl) { MachineFunction &MF = *MBB.getParent(); bool isHigh = !isARMLowRegister(DestReg) || @@ -162,13 +149,12 @@ static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes, /// emitThumbRegPlusImmediate - Emits a series of instructions to materialize /// a destreg = basereg + immediate in Thumb code. -static -void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned DestReg, unsigned BaseReg, - int NumBytes, const TargetInstrInfo &TII, - const Thumb1RegisterInfo& MRI, - DebugLoc dl) { +void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned DestReg, unsigned BaseReg, + int NumBytes, const TargetInstrInfo &TII, + const ARMBaseRegisterInfo& MRI, + DebugLoc dl) { bool isSub = NumBytes < 0; unsigned Bytes = (unsigned)NumBytes; if (isSub) Bytes = -NumBytes; @@ -304,7 +290,9 @@ static void emitSPUpdate(MachineBasicBlock &MBB, void Thumb1RegisterInfo:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - if (!hasReservedCallFrame(MF)) { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + if (!TFI->hasReservedCallFrame(MF)) { // If we have alloca, convert as follows: // ADJCALLSTACKDOWN -> sub, sp, sp, amount // ADJCALLSTACKUP -> add, sp, sp, amount @@ -315,7 +303,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); + unsigned Align = TFI->getStackAlignment(); Amount = (Amount+Align-1)/Align*Align; // Replace the pseudo instruction with a new instruction... @@ -363,6 +351,22 @@ static void removeOperands(MachineInstr &MI, unsigned i) { MI.RemoveOperand(Op); } +/// convertToNonSPOpcode - Change the opcode to the non-SP version, because +/// we're replacing the frame index with a non-SP register. +static unsigned convertToNonSPOpcode(unsigned Opcode) { + switch (Opcode) { + case ARM::tLDRspi: + case ARM::tRestore: // FIXME: Should this opcode be here? + return ARM::tLDRi; + + case ARM::tSTRspi: + case ARM::tSpill: // FIXME: Should this opcode be here? + return ARM::tSTRi; + } + + return Opcode; +} + bool Thumb1RegisterInfo:: rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, @@ -464,55 +468,51 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, } return true; } else { - unsigned ImmIdx = 0; - int InstrOffs = 0; - unsigned NumBits = 0; - unsigned Scale = 1; - switch (AddrMode) { - case ARMII::AddrModeT1_s: { - ImmIdx = FrameRegIdx+1; - InstrOffs = MI.getOperand(ImmIdx).getImm(); - NumBits = (FrameReg == ARM::SP) ? 8 : 5; - Scale = 4; - break; - } - default: + if (AddrMode != ARMII::AddrModeT1_s) llvm_unreachable("Unsupported addressing mode!"); - break; - } + + unsigned ImmIdx = FrameRegIdx + 1; + int InstrOffs = MI.getOperand(ImmIdx).getImm(); + unsigned NumBits = (FrameReg == ARM::SP) ? 8 : 5; + unsigned Scale = 4; Offset += InstrOffs * Scale; - assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!"); + assert((Offset & (Scale - 1)) == 0 && "Can't encode this offset!"); // Common case: small offset, fits into instruction. MachineOperand &ImmOp = MI.getOperand(ImmIdx); int ImmedOffset = Offset / Scale; unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) { - // Replace the FrameIndex with sp + // Replace the FrameIndex with the frame register (e.g., sp). MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); ImmOp.ChangeToImmediate(ImmedOffset); + + // If we're using a register where sp was stored, convert the instruction + // to the non-SP version. + unsigned NewOpc = convertToNonSPOpcode(Opcode); + if (NewOpc != Opcode && FrameReg != ARM::SP) + MI.setDesc(TII.get(NewOpc)); + return true; } - bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill; - if (AddrMode == ARMII::AddrModeT1_s) { - // Thumb tLDRspi, tSTRspi. These will change to instructions that use - // a different base register. - NumBits = 5; - Mask = (1 << NumBits) - 1; - } + NumBits = 5; + Mask = (1 << NumBits) - 1; + // If this is a thumb spill / restore, we will be using a constpool load to // materialize the offset. - if (AddrMode == ARMII::AddrModeT1_s && isThumSpillRestore) + if (Opcode == ARM::tRestore || Opcode == ARM::tSpill) { ImmOp.ChangeToImmediate(0); - else { + } else { // Otherwise, it didn't fit. Pull in what we can to simplify the immed. ImmedOffset = ImmedOffset & Mask; ImmOp.ChangeToImmediate(ImmedOffset); - Offset &= ~(Mask*Scale); + Offset &= ~(Mask * Scale); } } + return Offset == 0; } @@ -602,7 +602,8 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex)) Offset -= AFI->getGPRCalleeSavedArea2Offset(); else if (MF.getFrameInfo()->hasVarSizedObjects()) { - assert(SPAdj == 0 && hasFP(MF) && "Unexpected"); + assert(SPAdj == 0 && MF.getTarget().getFrameLowering()->hasFP(MF) && + "Unexpected"); // There are alloca()'s in this function, must reference off the frame // pointer or base pointer instead. if (!hasBasePointer(MF)) { @@ -655,13 +656,12 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, *this, dl); } - MI.setDesc(TII.get(ARM::tLDR)); + MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi)); MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true); if (UseRR) - // Use [reg, reg] addrmode. - MI.addOperand(MachineOperand::CreateReg(FrameReg, false)); - else // tLDR has an extra register operand. - MI.addOperand(MachineOperand::CreateReg(0, false)); + // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame + // register. The offset is already handled in the vreg value. + MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false); } else if (Desc.mayStore()) { VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); bool UseRR = false; @@ -677,14 +677,15 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } else emitThumbRegPlusImmediate(MBB, II, VReg, FrameReg, Offset, TII, *this, dl); - MI.setDesc(TII.get(ARM::tSTR)); + MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi)); MI.getOperand(i).ChangeToRegister(VReg, false, false, true); - if (UseRR) // Use [reg, reg] addrmode. - MI.addOperand(MachineOperand::CreateReg(FrameReg, false)); - else // tSTR has an extra register operand. - MI.addOperand(MachineOperand::CreateReg(0, false)); - } else + if (UseRR) + // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame + // register. The offset is already handled in the vreg value. + MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false); + } else { assert(false && "Unexpected opcode!"); + } // Add predicate back if it's needed. if (MI.getDesc().isPredicable()) { @@ -692,206 +693,3 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, AddDefaultPred(MIB); } } - -void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const { - MachineBasicBlock &MBB = MF.front(); - MachineBasicBlock::iterator MBBI = MBB.begin(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); - unsigned NumBytes = MFI->getStackSize(); - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); - - // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4. - NumBytes = (NumBytes + 3) & ~3; - MFI->setStackSize(NumBytes); - - // Determine the sizes of each callee-save spill areas and record which frame - // belongs to which callee-save spill areas. - unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; - int FramePtrSpillFI = 0; - - if (VARegSaveSize) - emitSPUpdate(MBB, MBBI, TII, dl, *this, -VARegSaveSize); - - if (!AFI->hasStackFrame()) { - if (NumBytes != 0) - emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes); - return; - } - - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - int FI = CSI[i].getFrameIdx(); - switch (Reg) { - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - if (Reg == FramePtr) - FramePtrSpillFI = FI; - AFI->addGPRCalleeSavedArea1Frame(FI); - GPRCS1Size += 4; - break; - case ARM::R8: - case ARM::R9: - case ARM::R10: - case ARM::R11: - if (Reg == FramePtr) - FramePtrSpillFI = FI; - if (STI.isTargetDarwin()) { - AFI->addGPRCalleeSavedArea2Frame(FI); - GPRCS2Size += 4; - } else { - AFI->addGPRCalleeSavedArea1Frame(FI); - GPRCS1Size += 4; - } - break; - default: - AFI->addDPRCalleeSavedAreaFrame(FI); - DPRCSSize += 8; - } - } - - if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) { - ++MBBI; - if (MBBI != MBB.end()) - dl = MBBI->getDebugLoc(); - } - - // Adjust FP so it point to the stack slot that contains the previous FP. - if (hasFP(MF)) { - BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) - .addFrameIndex(FramePtrSpillFI).addImm(0); - AFI->setShouldRestoreSPFromFP(true); - } - - // Determine starting offsets of spill areas. - unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); - unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; - unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; - AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); - AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); - AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); - AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); - - NumBytes = DPRCSOffset; - if (NumBytes) { - // Insert it after all the callee-save spills. - emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes); - } - - if (STI.isTargetELF() && hasFP(MF)) - MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - - AFI->getFramePtrSpillOffset()); - - AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); - AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); - AFI->setDPRCalleeSavedAreaSize(DPRCSSize); - - // If we need a base pointer, set it up here. It's whatever the value - // of the stack pointer is at this point. Any variable size objects - // will be allocated after this, so we can still use the base pointer - // to reference locals. - if (hasBasePointer(MF)) - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), BasePtr).addReg(ARM::SP); -} - -static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) { - if (MI->getOpcode() == ARM::tRestore && - MI->getOperand(1).isFI() && - isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs)) - return true; - else if (MI->getOpcode() == ARM::tPOP) { - // The first two operands are predicates. The last two are - // imp-def and imp-use of SP. Check everything in between. - for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i) - if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs)) - return false; - return true; - } - return false; -} - -void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = prior(MBB.end()); - assert((MBBI->getOpcode() == ARM::tBX_RET || - MBBI->getOpcode() == ARM::tPOP_RET) && - "Can only insert epilog into returning blocks"); - DebugLoc dl = MBBI->getDebugLoc(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); - int NumBytes = (int)MFI->getStackSize(); - const unsigned *CSRegs = getCalleeSavedRegs(); - - if (!AFI->hasStackFrame()) { - if (NumBytes != 0) - emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes); - } else { - // Unwind MBBI to point to first LDR / VLDRD. - if (MBBI != MBB.begin()) { - do - --MBBI; - while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs)); - if (!isCSRestore(MBBI, CSRegs)) - ++MBBI; - } - - // Move SP to start of FP callee save spill area. - NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + - AFI->getGPRCalleeSavedArea2Size() + - AFI->getDPRCalleeSavedAreaSize()); - - if (AFI->shouldRestoreSPFromFP()) { - NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; - // Reset SP based on frame pointer only if the stack frame extends beyond - // frame pointer stack slot or target is ELF and the function has FP. - if (NumBytes) - emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes, - TII, *this, dl); - else - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) - .addReg(FramePtr); - } else { - if (MBBI->getOpcode() == ARM::tBX_RET && - &MBB.front() != MBBI && - prior(MBBI)->getOpcode() == ARM::tPOP) { - MachineBasicBlock::iterator PMBBI = prior(MBBI); - emitSPUpdate(MBB, PMBBI, TII, dl, *this, NumBytes); - } else - emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes); - } - } - - if (VARegSaveSize) { - // Unlike T2 and ARM mode, the T1 pop instruction cannot restore - // to LR, and we can't pop the value directly to the PC since - // we need to update the SP after popping the value. Therefore, we - // pop the old LR into R3 as a temporary. - - // Move back past the callee-saved register restoration - while (MBBI != MBB.end() && isCSRestore(MBBI, CSRegs)) - ++MBBI; - // Epilogue for vararg functions: pop LR to R3 and branch off it. - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) - .addReg(ARM::R3, RegState::Define); - - emitSPUpdate(MBB, MBBI, TII, dl, *this, VARegSaveSize); - - BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)) - .addReg(ARM::R3, RegState::Kill); - // erase the old tBX_RET instruction - MBB.erase(MBBI); - } -} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h index c578054..8a87cc5 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h @@ -38,8 +38,6 @@ public: unsigned PredReg = 0) const; /// Code Generation virtual methods... - bool hasReservedCallFrame(const MachineFunction &MF) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; @@ -59,9 +57,6 @@ public: unsigned Reg) const; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, RegScavenger *RS = NULL) const; - - void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; }; } diff --git a/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.cpp deleted file mode 100644 index 172908d..0000000 --- a/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.cpp +++ /dev/null @@ -1,53 +0,0 @@ -//===-- Thumb2HazardRecognizer.cpp - Thumb2 postra hazard recognizer ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "ARM.h" -#include "Thumb2HazardRecognizer.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/ScheduleDAG.h" -using namespace llvm; - -ScheduleHazardRecognizer::HazardType -Thumb2HazardRecognizer::getHazardType(SUnit *SU) { - if (ITBlockSize) { - MachineInstr *MI = SU->getInstr(); - if (!MI->isDebugValue() && MI != ITBlockMIs[ITBlockSize-1]) - return Hazard; - } - - return PostRAHazardRecognizer::getHazardType(SU); -} - -void Thumb2HazardRecognizer::Reset() { - ITBlockSize = 0; - PostRAHazardRecognizer::Reset(); -} - -void Thumb2HazardRecognizer::EmitInstruction(SUnit *SU) { - MachineInstr *MI = SU->getInstr(); - unsigned Opcode = MI->getOpcode(); - if (ITBlockSize) { - --ITBlockSize; - } else if (Opcode == ARM::t2IT) { - unsigned Mask = MI->getOperand(1).getImm(); - unsigned NumTZ = CountTrailingZeros_32(Mask); - assert(NumTZ <= 3 && "Invalid IT mask!"); - ITBlockSize = 4 - NumTZ; - MachineBasicBlock::iterator I = MI; - for (unsigned i = 0; i < ITBlockSize; ++i) { - // Advance to the next instruction, skipping any dbg_value instructions. - do { - ++I; - } while (I->isDebugValue()); - ITBlockMIs[ITBlockSize-1-i] = &*I; - } - } - - PostRAHazardRecognizer::EmitInstruction(SU); -} diff --git a/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.h b/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.h deleted file mode 100644 index 4726658..0000000 --- a/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.h +++ /dev/null @@ -1,40 +0,0 @@ -//===-- Thumb2HazardRecognizer.h - Thumb2 Hazard Recognizers ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines hazard recognizers for scheduling Thumb2 functions on -// ARM processors. -// -//===----------------------------------------------------------------------===// - -#ifndef THUMB2HAZARDRECOGNIZER_H -#define THUMB2HAZARDRECOGNIZER_H - -#include "llvm/CodeGen/PostRAHazardRecognizer.h" - -namespace llvm { - -class MachineInstr; - -class Thumb2HazardRecognizer : public PostRAHazardRecognizer { - unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled. - MachineInstr *ITBlockMIs[4]; - -public: - Thumb2HazardRecognizer(const InstrItineraryData &ItinData) : - PostRAHazardRecognizer(ItinData) {} - - virtual HazardType getHazardType(SUnit *SU); - virtual void Reset(); - virtual void EmitInstruction(SUnit *SU); -}; - - -} // end namespace llvm - -#endif // THUMB2HAZARDRECOGNIZER_H diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 442f41d..2f67257 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -17,7 +17,6 @@ #include "ARMAddressingModes.h" #include "ARMGenInstrInfo.inc" #include "ARMMachineFunctionInfo.h" -#include "Thumb2HazardRecognizer.h" #include "Thumb2InstrInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -28,15 +27,10 @@ using namespace llvm; -static cl::opt<unsigned> -IfCvtLimit("thumb2-ifcvt-limit", cl::Hidden, - cl::desc("Thumb2 if-conversion limit (default 3)"), - cl::init(3)); - -static cl::opt<unsigned> -IfCvtDiamondLimit("thumb2-ifcvt-diamond-limit", cl::Hidden, - cl::desc("Thumb2 diamond if-conversion limit (default 3)"), - cl::init(3)); +static cl::opt<bool> +OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden, + cl::desc("Use old-style Thumb2 if-conversion heuristics"), + cl::init(false)); Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI(*this, STI) { @@ -105,21 +99,6 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, return llvm::getITInstrPredicate(MBBI, PredReg) == ARMCC::AL; } -bool Thumb2InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumInstrs) const { - return NumInstrs && NumInstrs <= IfCvtLimit; -} - -bool Thumb2InstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, - MachineBasicBlock &FMBB, unsigned NumF) const { - // FIXME: Catch optimization such as: - // r0 = movne - // r0 = moveq - return NumT && NumF && - NumT <= (IfCvtDiamondLimit) && NumF <= (IfCvtDiamondLimit); -} - void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, @@ -155,8 +134,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), - MachineMemOperand::MOStore, 0, + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOStore, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12)) @@ -181,8 +161,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), - MachineMemOperand::MOLoad, 0, + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOLoad, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg) @@ -193,11 +174,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI); } -ScheduleHazardRecognizer *Thumb2InstrInfo:: -CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const { - return (ScheduleHazardRecognizer *)new Thumb2HazardRecognizer(II); -} - void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 3a9f8b1..f2637d7 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -38,11 +38,6 @@ public: bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const; - bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs) const; - - bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTInstrs, - MachineBasicBlock &FMBB, unsigned NumFInstrs) const; - void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, @@ -70,9 +65,6 @@ public: /// always be able to get register info as well (through this method). /// const Thumb2RegisterInfo &getRegisterInfo() const { return RI; } - - ScheduleHazardRecognizer * - CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const; }; /// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical diff --git a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp index 07dd0be..099b8f7 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp @@ -29,7 +29,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLocation.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetFrameInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index 0c3962d..cc8f61c 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -58,7 +58,7 @@ namespace { { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0 }, { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0 }, // Note: immediate scale is 4. - { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 0 }, + { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 1 }, { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 1 }, { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 1 }, { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 0 }, @@ -68,9 +68,7 @@ namespace { //FIXME: Disable CMN, as CCodes are backwards from compare expectations //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0 }, { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0 }, - { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0 }, - { ARM::t2CMPzri,ARM::tCMPzi8, 0, 8, 0, 1, 0, 2,0, 0 }, - { ARM::t2CMPzrr,ARM::tCMPzhir,0, 0, 0, 0, 0, 2,0, 0 }, + { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 1 }, { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 0 }, // FIXME: adr.n immediate offset must be multiple of 4. //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0 }, @@ -106,26 +104,27 @@ namespace { // FIXME: Clean this up after splitting each Thumb load / store opcode // into multiple ones. - { ARM::t2LDRi12,ARM::tLDR, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 1 }, - { ARM::t2LDRs, ARM::tLDR, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRBi12,ARM::tLDRB, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRBs, ARM::tLDRB, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRHi12,ARM::tLDRH, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRHs, ARM::tLDRH, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 1 }, + { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 1 }, { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 1 }, { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRi12,ARM::tSTR, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 1 }, - { ARM::t2STRs, ARM::tSTR, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRBi12,ARM::tSTRB, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRBs, ARM::tSTRB, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRHi12,ARM::tSTRH, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRHs, ARM::tSTRH, 0, 0, 0, 1, 0, 0,0, 1 }, - - { ARM::t2LDM, ARM::tLDM, 0, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2LDM_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2LDM_UPD,ARM::tLDM_UPD,ARM::tPOP, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 1 }, + { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 1 }, + + { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 1 }, // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent - { ARM::t2STM_UPD,ARM::tSTM_UPD,ARM::tPUSH, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 1 }, }; class Thumb2SizeReduce : public MachineFunctionPass { @@ -217,8 +216,8 @@ Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, /// Old opcode has an optional def of CPSR. if (HasCC) return true; - // If both old opcode does not implicit CPSR def, then it's not ok since - // these new opcodes CPSR def is not meant to be thrown away. e.g. CMP. + // If old opcode does not implicitly define CPSR, then it's not ok since + // these new opcodes' CPSR def is not meant to be thrown away. e.g. CMP. if (!HasImplicitCPSRDef(MI->getDesc())) return false; HasCC = true; @@ -233,9 +232,10 @@ Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, static bool VerifyLowRegs(MachineInstr *MI) { unsigned Opc = MI->getOpcode(); - bool isPCOk = (Opc == ARM::t2LDM_RET || Opc == ARM::t2LDM || - Opc == ARM::t2LDM_UPD); - bool isLROk = (Opc == ARM::t2STM_UPD); + bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA || + Opc == ARM::t2LDMDB || Opc == ARM::t2LDMIA_UPD || + Opc == ARM::t2LDMDB_UPD); + bool isLROk = (Opc == ARM::t2STMIA_UPD || Opc == ARM::t2STMDB_UPD); bool isSPOk = isPCOk || isLROk || (Opc == ARM::t2ADDrSPi); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); @@ -275,29 +275,32 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, unsigned Opc = Entry.NarrowOpc1; unsigned OpNum = 3; // First 'rest' of operands. uint8_t ImmLimit = Entry.Imm1Limit; + switch (Entry.WideOpc) { default: llvm_unreachable("Unexpected Thumb2 load / store opcode!"); case ARM::t2LDRi12: - case ARM::t2STRi12: { - unsigned BaseReg = MI->getOperand(1).getReg(); - if (BaseReg == ARM::SP) { + case ARM::t2STRi12: + if (MI->getOperand(1).getReg() == ARM::SP) { Opc = Entry.NarrowOpc2; ImmLimit = Entry.Imm2Limit; HasOffReg = false; } + Scale = 4; HasImmOffset = true; + HasOffReg = false; break; - } case ARM::t2LDRBi12: case ARM::t2STRBi12: HasImmOffset = true; + HasOffReg = false; break; case ARM::t2LDRHi12: case ARM::t2STRHi12: Scale = 2; HasImmOffset = true; + HasOffReg = false; break; case ARM::t2LDRs: case ARM::t2LDRBs: @@ -310,11 +313,12 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, HasShift = true; OpNum = 4; break; - case ARM::t2LDM: { + case ARM::t2LDMIA: + case ARM::t2LDMDB: { unsigned BaseReg = MI->getOperand(0).getReg(); - ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm()); - if (!isARMLowRegister(BaseReg) || Mode != ARM_AM::ia) + if (!isARMLowRegister(BaseReg) || Entry.WideOpc != ARM::t2LDMIA) return false; + // For the non-writeback version (this one), the base register must be // one of the registers being loaded. bool isOK = false; @@ -324,6 +328,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, break; } } + if (!isOK) return false; @@ -331,28 +336,33 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, isLdStMul = true; break; } - case ARM::t2LDM_RET: { + case ARM::t2LDMIA_RET: { unsigned BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) return false; Opc = Entry.NarrowOpc2; // tPOP_RET - OpNum = 3; + OpNum = 2; isLdStMul = true; break; } - case ARM::t2LDM_UPD: - case ARM::t2STM_UPD: { + case ARM::t2LDMIA_UPD: + case ARM::t2LDMDB_UPD: + case ARM::t2STMIA_UPD: + case ARM::t2STMDB_UPD: { OpNum = 0; + unsigned BaseReg = MI->getOperand(1).getReg(); - ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(2).getImm()); if (BaseReg == ARM::SP && - ((Entry.WideOpc == ARM::t2LDM_UPD && Mode == ARM_AM::ia) || - (Entry.WideOpc == ARM::t2STM_UPD && Mode == ARM_AM::db))) { + (Entry.WideOpc == ARM::t2LDMIA_UPD || + Entry.WideOpc == ARM::t2STMDB_UPD)) { Opc = Entry.NarrowOpc2; // tPOP or tPUSH - OpNum = 3; - } else if (!isARMLowRegister(BaseReg) || Mode != ARM_AM::ia) { + OpNum = 2; + } else if (!isARMLowRegister(BaseReg) || + (Entry.WideOpc != ARM::t2LDMIA_UPD && + Entry.WideOpc != ARM::t2STMIA_UPD)) { return false; } + isLdStMul = true; break; } @@ -363,6 +373,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, if (HasShift) { OffsetReg = MI->getOperand(2).getReg(); OffsetKill = MI->getOperand(2).isKill(); + if (MI->getOperand(3).getImm()) // Thumb1 addressing mode doesn't support shift. return false; @@ -372,23 +383,22 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, if (HasImmOffset) { OffsetImm = MI->getOperand(2).getImm(); unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale; - if ((OffsetImm & (Scale-1)) || OffsetImm > MaxOffset) + + if ((OffsetImm & (Scale - 1)) || OffsetImm > MaxOffset) // Make sure the immediate field fits. return false; } // Add the 16-bit load / store instruction. - // FIXME: Thumb1 addressing mode encode both immediate and register offset. DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, TII->get(Opc)); if (!isLdStMul) { - MIB.addOperand(MI->getOperand(0)).addOperand(MI->getOperand(1)); - if (Opc != ARM::tLDRSB && Opc != ARM::tLDRSH) { - // tLDRSB and tLDRSH do not have an immediate offset field. On the other - // hand, it must have an offset register. - // FIXME: Remove this special case. - MIB.addImm(OffsetImm/Scale); - } + MIB.addOperand(MI->getOperand(0)); + MIB.addOperand(MI->getOperand(1)); + + if (HasImmOffset) + MIB.addImm(OffsetImm / Scale); + assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!"); if (HasOffReg) @@ -423,7 +433,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, unsigned Opc = MI->getOpcode(); switch (Opc) { default: break; - case ARM::t2ADDSri: + case ARM::t2ADDSri: case ARM::t2ADDSrr: { unsigned PredReg = 0; if (getInstrPredicate(MI, PredReg) == ARMCC::AL) { @@ -451,6 +461,25 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, if (MI->getOperand(1).isImm()) return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); break; + case ARM::t2CMPrr: { + // Try to reduce to the lo-reg only version first. Why there are two + // versions of the instruction is a mystery. + // It would be nice to just have two entries in the master table that + // are prioritized, but the table assumes a unique entry for each + // source insn opcode. So for now, we hack a local entry record to use. + static const ReduceEntry NarrowEntry = + { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 1 }; + if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR)) + return true; + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + } + case ARM::t2ADDrSPi: { + static const ReduceEntry NarrowEntry = + { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 1 }; + if (MI->getOperand(0).getReg() == ARM::SP) + return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + } } return false; } |